SIInstrInfo.cpp
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
52// Must be at least 4 to be able to branch over minimum unconditional branch
53// code. This is only for making it possible to write reasonably small tests for
54// long branches.
55static cl::opt<unsigned>
56BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
60 "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(true),
64
65SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
66    : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
67 AMDGPU::ADJCALLSTACKDOWN),
68 RI(ST), ST(ST) {
69 SchedModel.init(&ST);
70}
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
83/// Returns true if both nodes have the same value for the given
84/// operand \p Op, or if both nodes do not have this operand.
85static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
86                                      AMDGPU::OpName OpName) {
87 unsigned Opc0 = N0->getMachineOpcode();
88 unsigned Opc1 = N1->getMachineOpcode();
89
90 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
91 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
92
93 if (Op0Idx == -1 && Op1Idx == -1)
94 return true;
95
96
97 if ((Op0Idx == -1 && Op1Idx != -1) ||
98 (Op1Idx == -1 && Op0Idx != -1))
99 return false;
100
101 // getNamedOperandIdx returns the index for the MachineInstr's operands,
102 // which includes the result as the first operand. We are indexing into the
103 // MachineSDNode's operands, so we need to skip the result operand to get
104 // the real index.
105 --Op0Idx;
106 --Op1Idx;
107
108 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
109}
110
111static bool canRemat(const MachineInstr &MI) {
112
113  if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
114      SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
115      SIInstrInfo::isSALU(MI))
116    return true;
117
118 if (SIInstrInfo::isSMRD(MI)) {
119 return !MI.memoperands_empty() &&
120 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
121 return MMO->isLoad() && MMO->isInvariant();
122 });
123 }
124
125 return false;
126}
127
128bool SIInstrInfo::isReallyTriviallyReMaterializable(
129    const MachineInstr &MI) const {
130
131 if (canRemat(MI)) {
132    // Normally a VALU use of exec would block rematerialization, but an
133    // implicit exec read is OK here since every VALU instruction has one.
134    // Apart from that, we want all of the generic logic to apply.
135
136 // Another potential implicit use is mode register. The core logic of
137 // the RA will not attempt rematerialization if mode is set anywhere
138 // in the function, otherwise it is safe since mode is not changed.
139
140    // This differs from the generic method, which does not allow
141    // rematerialization if there are virtual register uses. We allow it,
142    // which is why this method also covers SOP instructions.
143 if (!MI.hasImplicitDef() &&
144 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
145 !MI.mayRaiseFPException())
146 return true;
147 }
148
149  return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
150}
151
152// Returns true if the scalar result of a VALU instruction depends on exec.
153bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
154 // Ignore comparisons which are only used masked with exec.
155 // This allows some hoisting/sinking of VALU comparisons.
156 if (MI.isCompare()) {
157 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
158 if (!Dst)
159 return true;
160
161 Register DstReg = Dst->getReg();
162 if (!DstReg.isVirtual())
163 return true;
164
165 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
166 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
167 switch (Use.getOpcode()) {
168 case AMDGPU::S_AND_SAVEEXEC_B32:
169 case AMDGPU::S_AND_SAVEEXEC_B64:
170 break;
171 case AMDGPU::S_AND_B32:
172 case AMDGPU::S_AND_B64:
173 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
174 return true;
175 break;
176 default:
177 return true;
178 }
179 }
180 return false;
181 }
182
183 switch (MI.getOpcode()) {
184 default:
185 break;
186 case AMDGPU::V_READFIRSTLANE_B32:
187 return true;
188 }
189
190 return false;
191}
192
193bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
194  // Any implicit use of exec by VALU is not a real register read.
195 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
196 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
197}
198
199bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
200                               MachineBasicBlock *SuccToSinkTo,
201 MachineCycleInfo *CI) const {
202 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
203 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
204 return true;
205
206 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
207 // Check if sinking of MI would create temporal divergent use.
208 for (auto Op : MI.uses()) {
209 if (Op.isReg() && Op.getReg().isVirtual() &&
210 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
211 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
212
213 // SgprDef defined inside cycle
214 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
215 if (FromCycle == nullptr)
216 continue;
217
218 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
219 // Check if there is a FromCycle that contains SgprDef's basic block but
220 // does not contain SuccToSinkTo and also has divergent exit condition.
221 while (FromCycle && !FromCycle->contains(ToCycle)) {
222      SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
223      FromCycle->getExitingBlocks(ExitingBlocks);
224
225 // FromCycle has divergent exit condition.
226 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
227 if (hasDivergentBranch(ExitingBlock))
228 return false;
229 }
230
231 FromCycle = FromCycle->getParentCycle();
232 }
233 }
234 }
235
236 return true;
237}
238
239bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
240                                          int64_t &Offset0,
241 int64_t &Offset1) const {
242 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
243 return false;
244
245 unsigned Opc0 = Load0->getMachineOpcode();
246 unsigned Opc1 = Load1->getMachineOpcode();
247
248 // Make sure both are actually loads.
249 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
250 return false;
251
252 // A mayLoad instruction without a def is not a load. Likely a prefetch.
253 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
254 return false;
255
256 if (isDS(Opc0) && isDS(Opc1)) {
257
258 // FIXME: Handle this case:
259 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
260 return false;
261
262 // Check base reg.
263 if (Load0->getOperand(0) != Load1->getOperand(0))
264 return false;
265
266 // Skip read2 / write2 variants for simplicity.
267    // TODO: We should report true if the used offsets are adjacent (excluding
268    // the st64 versions).
269 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
270 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
271 if (Offset0Idx == -1 || Offset1Idx == -1)
272 return false;
273
274 // XXX - be careful of dataless loads
275 // getNamedOperandIdx returns the index for MachineInstrs. Since they
276 // include the output in the operand list, but SDNodes don't, we need to
277 // subtract the index by one.
278 Offset0Idx -= get(Opc0).NumDefs;
279 Offset1Idx -= get(Opc1).NumDefs;
280 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
281 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
282 return true;
283 }
284
285 if (isSMRD(Opc0) && isSMRD(Opc1)) {
286 // Skip time and cache invalidation instructions.
287 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
288 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
289 return false;
290
291 unsigned NumOps = getNumOperandsNoGlue(Load0);
292 if (NumOps != getNumOperandsNoGlue(Load1))
293 return false;
294
295 // Check base reg.
296 if (Load0->getOperand(0) != Load1->getOperand(0))
297 return false;
298
299 // Match register offsets, if both register and immediate offsets present.
300 assert(NumOps == 4 || NumOps == 5);
301 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
302 return false;
303
304    const ConstantSDNode *Load0Offset =
305        dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
306    const ConstantSDNode *Load1Offset =
307        dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
308
309 if (!Load0Offset || !Load1Offset)
310 return false;
311
312 Offset0 = Load0Offset->getZExtValue();
313 Offset1 = Load1Offset->getZExtValue();
314 return true;
315 }
316
317 // MUBUF and MTBUF can access the same addresses.
318 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
319
320 // MUBUF and MTBUF have vaddr at different indices.
321 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
322 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
323 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
324 return false;
325
326 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
327 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
328
329 if (OffIdx0 == -1 || OffIdx1 == -1)
330 return false;
331
332 // getNamedOperandIdx returns the index for MachineInstrs. Since they
333 // include the output in the operand list, but SDNodes don't, we need to
334 // subtract the index by one.
335 OffIdx0 -= get(Opc0).NumDefs;
336 OffIdx1 -= get(Opc1).NumDefs;
337
338 SDValue Off0 = Load0->getOperand(OffIdx0);
339 SDValue Off1 = Load1->getOperand(OffIdx1);
340
341 // The offset might be a FrameIndexSDNode.
342 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
343 return false;
344
345 Offset0 = Off0->getAsZExtVal();
346 Offset1 = Off1->getAsZExtVal();
347 return true;
348 }
349
350 return false;
351}
352
353static bool isStride64(unsigned Opc) {
354 switch (Opc) {
355 case AMDGPU::DS_READ2ST64_B32:
356 case AMDGPU::DS_READ2ST64_B64:
357 case AMDGPU::DS_WRITE2ST64_B32:
358 case AMDGPU::DS_WRITE2ST64_B64:
359 return true;
360 default:
361 return false;
362 }
363}
364
365bool SIInstrInfo::getMemOperandsWithOffsetWidth(
366    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
367    int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
368 const TargetRegisterInfo *TRI) const {
369 if (!LdSt.mayLoadOrStore())
370 return false;
371
372 unsigned Opc = LdSt.getOpcode();
373 OffsetIsScalable = false;
374 const MachineOperand *BaseOp, *OffsetOp;
375 int DataOpIdx;
376
377 if (isDS(LdSt)) {
378 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
379 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
380 if (OffsetOp) {
381 // Normal, single offset LDS instruction.
382 if (!BaseOp) {
383 // DS_CONSUME/DS_APPEND use M0 for the base address.
384 // TODO: find the implicit use operand for M0 and use that as BaseOp?
385 return false;
386 }
387 BaseOps.push_back(BaseOp);
388 Offset = OffsetOp->getImm();
389 // Get appropriate operand, and compute width accordingly.
390 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
391 if (DataOpIdx == -1)
392 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
393 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
394 Width = LocationSize::precise(64);
395 else
396 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
397 } else {
398 // The 2 offset instructions use offset0 and offset1 instead. We can treat
399 // these as a load with a single offset if the 2 offsets are consecutive.
400 // We will use this for some partially aligned loads.
401 const MachineOperand *Offset0Op =
402 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
403 const MachineOperand *Offset1Op =
404 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
405
406 unsigned Offset0 = Offset0Op->getImm() & 0xff;
407 unsigned Offset1 = Offset1Op->getImm() & 0xff;
408 if (Offset0 + 1 != Offset1)
409 return false;
410
411 // Each of these offsets is in element sized units, so we need to convert
412 // to bytes of the individual reads.
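// For example, a DS_READ2_B32 has a 64-bit destination, so EltSize becomes
// 64 / 16 = 4 bytes below, and an offset0 of 2 yields a byte offset of 8
// (further scaled by 64 for the ST64 variants).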
413
414 unsigned EltSize;
415 if (LdSt.mayLoad())
416 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
417 else {
418 assert(LdSt.mayStore());
419 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
421 }
422
423 if (isStride64(Opc))
424 EltSize *= 64;
425
426 BaseOps.push_back(BaseOp);
427 Offset = EltSize * Offset0;
428 // Get appropriate operand(s), and compute width accordingly.
429 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
430 if (DataOpIdx == -1) {
431 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
432 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
433 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
434 Width = LocationSize::precise(
435 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
436 } else {
437 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
438 }
439 }
440 return true;
441 }
442
443 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
444 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
445 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
446 return false;
447 BaseOps.push_back(RSrc);
448 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
449 if (BaseOp && !BaseOp->isFI())
450 BaseOps.push_back(BaseOp);
451 const MachineOperand *OffsetImm =
452 getNamedOperand(LdSt, AMDGPU::OpName::offset);
453 Offset = OffsetImm->getImm();
454 const MachineOperand *SOffset =
455 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
456 if (SOffset) {
457 if (SOffset->isReg())
458 BaseOps.push_back(SOffset);
459 else
460 Offset += SOffset->getImm();
461 }
462 // Get appropriate operand, and compute width accordingly.
463 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
464 if (DataOpIdx == -1)
465 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
466 if (DataOpIdx == -1) // LDS DMA
467 return false;
468 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
469 return true;
470 }
471
472 if (isImage(LdSt)) {
473 auto RsrcOpName =
474 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
475 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
476 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
477 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
478 if (VAddr0Idx >= 0) {
479 // GFX10 possible NSA encoding.
480 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
481 BaseOps.push_back(&LdSt.getOperand(I));
482 } else {
483 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
484 }
485 Offset = 0;
486 // Get appropriate operand, and compute width accordingly.
487 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
488 if (DataOpIdx == -1)
489 return false; // no return sampler
490 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
491 return true;
492 }
493
494 if (isSMRD(LdSt)) {
495 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
496 if (!BaseOp) // e.g. S_MEMTIME
497 return false;
498 BaseOps.push_back(BaseOp);
499 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
500 Offset = OffsetOp ? OffsetOp->getImm() : 0;
501 // Get appropriate operand, and compute width accordingly.
502 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
503 if (DataOpIdx == -1)
504 return false;
505 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
506 return true;
507 }
508
509 if (isFLAT(LdSt)) {
510 // Instructions have either vaddr or saddr or both or none.
511 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
512 if (BaseOp)
513 BaseOps.push_back(BaseOp);
514 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
515 if (BaseOp)
516 BaseOps.push_back(BaseOp);
517 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
518 // Get appropriate operand, and compute width accordingly.
519 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
520 if (DataOpIdx == -1)
521 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
522 if (DataOpIdx == -1) // LDS DMA
523 return false;
524 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
525 return true;
526 }
527
528 return false;
529}
530
531static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
532                                  ArrayRef<const MachineOperand *> BaseOps1,
533                                  const MachineInstr &MI2,
534                                  ArrayRef<const MachineOperand *> BaseOps2) {
535 // Only examine the first "base" operand of each instruction, on the
536 // assumption that it represents the real base address of the memory access.
537 // Other operands are typically offsets or indices from this base address.
538 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
539 return true;
540
541 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
542 return false;
543
544 auto *MO1 = *MI1.memoperands_begin();
545 auto *MO2 = *MI2.memoperands_begin();
546 if (MO1->getAddrSpace() != MO2->getAddrSpace())
547 return false;
548
549 const auto *Base1 = MO1->getValue();
550 const auto *Base2 = MO2->getValue();
551 if (!Base1 || !Base2)
552 return false;
553 Base1 = getUnderlyingObject(Base1);
554 Base2 = getUnderlyingObject(Base2);
555
556 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
557 return false;
558
559 return Base1 == Base2;
560}
561
562bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
563                                      int64_t Offset1, bool OffsetIsScalable1,
564                                      ArrayRef<const MachineOperand *> BaseOps2,
565                                      int64_t Offset2, bool OffsetIsScalable2,
566 unsigned ClusterSize,
567 unsigned NumBytes) const {
568 // If the mem ops (to be clustered) do not have the same base ptr, then they
569 // should not be clustered
570 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
571 if (!BaseOps1.empty() && !BaseOps2.empty()) {
572 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
573 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
574 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
575 return false;
576
577 const SIMachineFunctionInfo *MFI =
578 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
579 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
580 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
581 // If only one base op is empty, they do not have the same base ptr
582 return false;
583 }
584
585  // To avoid register pressure, the number of DWORDs loaded together by all
586  // clustered mem ops should, on average, not exceed MaxMemoryClusterDWords.
587  // This is an empirical value based on certain observations and
588  // performance-related experiments.
589  // The good thing about this heuristic is that it avoids clustering too many
590  // sub-word loads while also avoiding clustering of wide loads. Below is a
591  // brief summary of how the heuristic behaves for various `LoadSize` values
592  // when MaxMemoryClusterDWords is 8.
593 //
594 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
595 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
596 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
597 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
598 // (5) LoadSize >= 17: do not cluster
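// For instance, clustering four 8-byte loads gives NumBytes = 32,
// LoadSize = 32 / 4 = 8, and NumDWords = ((8 + 3) / 4) * 4 = 8, which still
// fits within the default limit of 8 DWORDs.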
599 const unsigned LoadSize = NumBytes / ClusterSize;
600 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
601 return NumDWords <= MaxMemoryClusterDWords;
602}
603
604// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
605// the first 16 loads will be interleaved with the stores, and the next 16 will
606// be clustered as expected. It should really split into two batches of 16
607// stores.
607//
608// Loads are clustered until this returns false, rather than trying to schedule
609// groups of stores. This also means we have to deal with saying different
610// address space loads should be clustered, and ones which might cause bank
611// conflicts.
612//
613// This might be deprecated so it might not be worth that much effort to fix.
614bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
615                                          int64_t Offset0, int64_t Offset1,
616 unsigned NumLoads) const {
617 assert(Offset1 > Offset0 &&
618 "Second offset should be larger than first offset!");
619 // If we have less than 16 loads in a row, and the offsets are within 64
620 // bytes, then schedule together.
621
622 // A cacheline is 64 bytes (for global memory).
623 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
624}
625
626static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
627                              MachineBasicBlock::iterator MI,
628                              const DebugLoc &DL, MCRegister DestReg,
629 MCRegister SrcReg, bool KillSrc,
630 const char *Msg = "illegal VGPR to SGPR copy") {
631 MachineFunction *MF = MBB.getParent();
632
633  LLVMContext &C = MF->getFunction().getContext();
634  C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
635
636 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
637 .addReg(SrcReg, getKillRegState(KillSrc));
638}
639
640/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
641/// possible to have a direct copy in these cases on GFX908, so an intermediate
642/// VGPR copy is required.
643static void indirectCopyToAGPR(const SIInstrInfo &TII,
644                               MachineBasicBlock &MBB,
645                               MachineBasicBlock::iterator MI,
646                               const DebugLoc &DL, MCRegister DestReg,
647 MCRegister SrcReg, bool KillSrc,
648 RegScavenger &RS, bool RegsOverlap,
649 Register ImpDefSuperReg = Register(),
650 Register ImpUseSuperReg = Register()) {
651 assert((TII.getSubtarget().hasMAIInsts() &&
652 !TII.getSubtarget().hasGFX90AInsts()) &&
653 "Expected GFX908 subtarget.");
654
655 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
656 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
657 "Source register of the copy should be either an SGPR or an AGPR.");
658
659 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
660 "Destination register of the copy should be an AGPR.");
661
662 const SIRegisterInfo &RI = TII.getRegisterInfo();
663
664 // First try to find defining accvgpr_write to avoid temporary registers.
665 // In the case of copies of overlapping AGPRs, we conservatively do not
666 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
667 // an accvgpr_write used for this same copy due to implicit-defs
668 if (!RegsOverlap) {
669 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
670 --Def;
671
672 if (!Def->modifiesRegister(SrcReg, &RI))
673 continue;
674
675 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
676 Def->getOperand(0).getReg() != SrcReg)
677 break;
678
679 MachineOperand &DefOp = Def->getOperand(1);
680 assert(DefOp.isReg() || DefOp.isImm());
681
682 if (DefOp.isReg()) {
683 bool SafeToPropagate = true;
684 // Check that register source operand is not clobbered before MI.
685 // Immediate operands are always safe to propagate.
686 for (auto I = Def; I != MI && SafeToPropagate; ++I)
687 if (I->modifiesRegister(DefOp.getReg(), &RI))
688 SafeToPropagate = false;
689
690 if (!SafeToPropagate)
691 break;
692
693 for (auto I = Def; I != MI; ++I)
694 I->clearRegisterKills(DefOp.getReg(), &RI);
695 }
696
697 MachineInstrBuilder Builder =
698 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
699 .add(DefOp);
700 if (ImpDefSuperReg)
701 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
702
703 if (ImpUseSuperReg) {
704 Builder.addReg(ImpUseSuperReg,
705                       getKillRegState(KillSrc) | RegState::Implicit);
706      }
707
708 return;
709 }
710 }
711
712 RS.enterBasicBlockEnd(MBB);
713 RS.backward(std::next(MI));
714
715 // Ideally we want to have three registers for a long reg_sequence copy
716 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
717 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
718 *MBB.getParent());
719
720 // Registers in the sequence are allocated contiguously so we can just
721 // use register number to pick one of three round-robin temps.
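// E.g. a copy to AGPR2 tries to scavenge up to two extra VGPRs below, while a
// copy to AGPR0 simply uses the reserved VGPR temporary.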
722 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
723 Register Tmp =
724 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
725 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
726 "VGPR used for an intermediate copy should have been reserved.");
727
728 // Only loop through if there are any free registers left. We don't want to
729 // spill.
730 while (RegNo--) {
731 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
732 /* RestoreAfter */ false, 0,
733 /* AllowSpill */ false);
734 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
735 break;
736 Tmp = Tmp2;
737 RS.setRegUsed(Tmp);
738 }
739
740 // Insert copy to temporary VGPR.
741 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
742 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
743 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
744 } else {
745 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
746 }
747
748 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
749 .addReg(SrcReg, getKillRegState(KillSrc));
750 if (ImpUseSuperReg) {
751 UseBuilder.addReg(ImpUseSuperReg,
752                      getKillRegState(KillSrc) | RegState::Implicit);
753  }
754
755 MachineInstrBuilder DefBuilder
756 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
757 .addReg(Tmp, RegState::Kill);
758
759 if (ImpDefSuperReg)
760 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
761}
762
763static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
764                           MachineBasicBlock::iterator I, const DebugLoc &DL,
765                           MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
766 const TargetRegisterClass *RC, bool Forward) {
767 const SIRegisterInfo &RI = TII.getRegisterInfo();
768 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
770 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
771
772 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
773 int16_t SubIdx = BaseIndices[Idx];
774 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
775 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
776 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
777 unsigned Opcode = AMDGPU::S_MOV_B32;
778
779 // Is SGPR aligned? If so try to combine with next.
780 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
781 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
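// E.g. copying between even-aligned pairs such as SGPR4_SGPR5 emits a single
// S_MOV_B64 instead of two S_MOV_B32s.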
782 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
783 // Can use SGPR64 copy
784 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
785 SubIdx = RI.getSubRegFromChannel(Channel, 2);
786 DestSubReg = RI.getSubReg(DestReg, SubIdx);
787 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
788 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
789 Opcode = AMDGPU::S_MOV_B64;
790 Idx++;
791 }
792
793 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
794 .addReg(SrcSubReg)
795 .addReg(SrcReg, RegState::Implicit);
796
797 if (!FirstMI)
798 FirstMI = LastMI;
799
800 if (!Forward)
801 I--;
802 }
803
804 assert(FirstMI && LastMI);
805 if (!Forward)
806 std::swap(FirstMI, LastMI);
807
808 FirstMI->addOperand(
809 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
810
811 if (KillSrc)
812 LastMI->addRegisterKilled(SrcReg, &RI);
813}
814
815void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
816                              MachineBasicBlock::iterator MI,
817                              const DebugLoc &DL, Register DestReg,
818 Register SrcReg, bool KillSrc, bool RenamableDest,
819 bool RenamableSrc) const {
820 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
821 unsigned Size = RI.getRegSizeInBits(*RC);
822 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
823 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
824
825 // The rest of copyPhysReg assumes Src and Dst size are the same size.
826 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
827 // we remove Fix16BitCopies and this code block?
828 if (Fix16BitCopies) {
829 if (((Size == 16) != (SrcSize == 16))) {
830 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
831 assert(ST.useRealTrue16Insts());
832 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
833 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
834 RegToFix = SubReg;
835
836 if (DestReg == SrcReg) {
837 // Identity copy. Insert empty bundle since ExpandPostRA expects an
838 // instruction here.
839 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
840 return;
841 }
842 RC = RI.getPhysRegBaseClass(DestReg);
843 Size = RI.getRegSizeInBits(*RC);
844 SrcRC = RI.getPhysRegBaseClass(SrcReg);
845 SrcSize = RI.getRegSizeInBits(*SrcRC);
846 }
847 }
848
849 if (RC == &AMDGPU::VGPR_32RegClass) {
850 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
851 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
852 AMDGPU::AGPR_32RegClass.contains(SrcReg));
853 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
854 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
855 BuildMI(MBB, MI, DL, get(Opc), DestReg)
856 .addReg(SrcReg, getKillRegState(KillSrc));
857 return;
858 }
859
860 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
861 RC == &AMDGPU::SReg_32RegClass) {
862 if (SrcReg == AMDGPU::SCC) {
863 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
864 .addImm(1)
865 .addImm(0);
866 return;
867 }
868
869 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
870 if (DestReg == AMDGPU::VCC_LO) {
871 // FIXME: Hack until VReg_1 removed.
872 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
873 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
874 .addImm(0)
875 .addReg(SrcReg, getKillRegState(KillSrc));
876 return;
877 }
878
879 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
880 return;
881 }
882
883 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 return;
886 }
887
888 if (RC == &AMDGPU::SReg_64RegClass) {
889 if (SrcReg == AMDGPU::SCC) {
890 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
891 .addImm(1)
892 .addImm(0);
893 return;
894 }
895
896 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
897 if (DestReg == AMDGPU::VCC) {
898 // FIXME: Hack until VReg_1 removed.
899 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
900 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
901 .addImm(0)
902 .addReg(SrcReg, getKillRegState(KillSrc));
903 return;
904 }
905
906 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
907 return;
908 }
909
910 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
911 .addReg(SrcReg, getKillRegState(KillSrc));
912 return;
913 }
914
915 if (DestReg == AMDGPU::SCC) {
916 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
917 // but SelectionDAG emits such copies for i1 sources.
918 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
919 // This copy can only be produced by patterns
920 // with explicit SCC, which are known to be enabled
921 // only for subtargets with S_CMP_LG_U64 present.
922 assert(ST.hasScalarCompareEq64());
923 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
924 .addReg(SrcReg, getKillRegState(KillSrc))
925 .addImm(0);
926 } else {
927 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
928 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
929 .addReg(SrcReg, getKillRegState(KillSrc))
930 .addImm(0);
931 }
932
933 return;
934 }
935
936 if (RC == &AMDGPU::AGPR_32RegClass) {
937 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
938 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
939 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
940 .addReg(SrcReg, getKillRegState(KillSrc));
941 return;
942 }
943
944 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
945 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
946 .addReg(SrcReg, getKillRegState(KillSrc));
947 return;
948 }
949
950 // FIXME: Pass should maintain scavenger to avoid scan through the block on
951 // every AGPR spill.
952 RegScavenger RS;
953 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
954 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
955 return;
956 }
957
958 if (Size == 16) {
959 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
960 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
961 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
962
963 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
964 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
965 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
966 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
967 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
968 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
969 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
970 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
971
972 if (IsSGPRDst) {
973 if (!IsSGPRSrc) {
974 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
975 return;
976 }
977
978 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
979 .addReg(NewSrcReg, getKillRegState(KillSrc));
980 return;
981 }
982
983 if (IsAGPRDst || IsAGPRSrc) {
984 if (!DstLow || !SrcLow) {
985 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
986 "Cannot use hi16 subreg with an AGPR!");
987 }
988
989 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
990 return;
991 }
992
993 if (ST.useRealTrue16Insts()) {
994 if (IsSGPRSrc) {
995 assert(SrcLow);
996 SrcReg = NewSrcReg;
997 }
998 // Use the smaller instruction encoding if possible.
999 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1000 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1001 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1002 .addReg(SrcReg);
1003 } else {
1004 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1005 .addImm(0) // src0_modifiers
1006 .addReg(SrcReg)
1007 .addImm(0); // op_sel
1008 }
1009 return;
1010 }
1011
1012 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1013 if (!DstLow || !SrcLow) {
1014 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1015 "Cannot use hi16 subreg on VI!");
1016 }
1017
1018 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1019 .addReg(NewSrcReg, getKillRegState(KillSrc));
1020 return;
1021 }
1022
1023 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1024 .addImm(0) // src0_modifiers
1025 .addReg(NewSrcReg)
1026 .addImm(0) // clamp
1027                 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1028                                : AMDGPU::SDWA::SdwaSel::WORD_1)
1029                 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1030                 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1031                                : AMDGPU::SDWA::SdwaSel::WORD_1)
1032                 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1033  // First implicit operand is $exec.
1034 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1035 return;
1036 }
1037
1038 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1039 if (ST.hasMovB64()) {
1040 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1041 .addReg(SrcReg, getKillRegState(KillSrc));
1042 return;
1043 }
1044 if (ST.hasPkMovB32()) {
1045      BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1046        .addImm(SISrcMods::OP_SEL_1)
1047        .addReg(SrcReg)
1048        .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1049        .addReg(SrcReg)
1050 .addImm(0) // op_sel_lo
1051 .addImm(0) // op_sel_hi
1052 .addImm(0) // neg_lo
1053 .addImm(0) // neg_hi
1054 .addImm(0) // clamp
1055 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1056 return;
1057 }
1058 }
1059
1060 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1061 if (RI.isSGPRClass(RC)) {
1062 if (!RI.isSGPRClass(SrcRC)) {
1063 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1064 return;
1065 }
1066 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1067 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1068 Forward);
1069 return;
1070 }
1071
1072 unsigned EltSize = 4;
1073 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1074 if (RI.isAGPRClass(RC)) {
1075 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1076 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1077 else if (RI.hasVGPRs(SrcRC) ||
1078 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1079 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1080 else
1081 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1082 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1083 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1084 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1085 (RI.isProperlyAlignedRC(*RC) &&
1086 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1087 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1088 if (ST.hasMovB64()) {
1089 Opcode = AMDGPU::V_MOV_B64_e32;
1090 EltSize = 8;
1091 } else if (ST.hasPkMovB32()) {
1092 Opcode = AMDGPU::V_PK_MOV_B32;
1093 EltSize = 8;
1094 }
1095 }
1096
1097 // For the cases where we need an intermediate instruction/temporary register
1098 // (destination is an AGPR), we need a scavenger.
1099 //
1100 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1101 // whole block for every handled copy.
1102 std::unique_ptr<RegScavenger> RS;
1103 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1104 RS = std::make_unique<RegScavenger>();
1105
1106 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1107
1108 // If there is an overlap, we can't kill the super-register on the last
1109 // instruction, since it will also kill the components made live by this def.
1110 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1111 const bool CanKillSuperReg = KillSrc && !Overlap;
1112
1113 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1114 unsigned SubIdx;
1115 if (Forward)
1116 SubIdx = SubIndices[Idx];
1117 else
1118 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1119 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1120 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1121 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1122
1123 bool IsFirstSubreg = Idx == 0;
1124 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1125
1126 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1127 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1128 Register ImpUseSuper = SrcReg;
1129 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1130 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1131    } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1132      MachineInstrBuilder MIB =
1133          BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1134              .addImm(SISrcMods::OP_SEL_1)
1135              .addReg(SrcSubReg)
1136              .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1137              .addReg(SrcSubReg)
1138 .addImm(0) // op_sel_lo
1139 .addImm(0) // op_sel_hi
1140 .addImm(0) // neg_lo
1141 .addImm(0) // neg_hi
1142 .addImm(0) // clamp
1143 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1144      if (IsFirstSubreg)
1145        MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1146    } else {
1147 MachineInstrBuilder Builder =
1148 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1149 if (IsFirstSubreg)
1150 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1151
1152 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1153 }
1154 }
1155}
1156
1157int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1158 int NewOpc;
1159
1160 // Try to map original to commuted opcode
1161 NewOpc = AMDGPU::getCommuteRev(Opcode);
1162 if (NewOpc != -1)
1163 // Check if the commuted (REV) opcode exists on the target.
1164 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1165
1166 // Try to map commuted to original opcode
1167 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1168 if (NewOpc != -1)
1169 // Check if the original (non-REV) opcode exists on the target.
1170 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1171
1172 return Opcode;
1173}
1174
1175const TargetRegisterClass *
1177 return &AMDGPU::VGPR_32RegClass;
1178}
1179
1180void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1181                                     MachineBasicBlock::iterator I,
1182                                     const DebugLoc &DL, Register DstReg,
1183                                     ArrayRef<MachineOperand> Cond,
1184                                     Register TrueReg,
1185 Register FalseReg) const {
1186 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1187 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1189 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1190 "Not a VGPR32 reg");
1191
1192 if (Cond.size() == 1) {
1193 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1194 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1195 .add(Cond[0]);
1196 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1197 .addImm(0)
1198 .addReg(FalseReg)
1199 .addImm(0)
1200 .addReg(TrueReg)
1201 .addReg(SReg);
1202 } else if (Cond.size() == 2) {
1203 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1204 switch (Cond[0].getImm()) {
1205 case SIInstrInfo::SCC_TRUE: {
1206 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1207 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1208 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1209 .addImm(0)
1210 .addReg(FalseReg)
1211 .addImm(0)
1212 .addReg(TrueReg)
1213 .addReg(SReg);
1214 break;
1215 }
1216 case SIInstrInfo::SCC_FALSE: {
1217 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1218 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1219 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1220 .addImm(0)
1221 .addReg(FalseReg)
1222 .addImm(0)
1223 .addReg(TrueReg)
1224 .addReg(SReg);
1225 break;
1226 }
1227 case SIInstrInfo::VCCNZ: {
1228 MachineOperand RegOp = Cond[1];
1229 RegOp.setImplicit(false);
1230 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1231 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1232 .add(RegOp);
1233 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1234 .addImm(0)
1235 .addReg(FalseReg)
1236 .addImm(0)
1237 .addReg(TrueReg)
1238 .addReg(SReg);
1239 break;
1240 }
1241 case SIInstrInfo::VCCZ: {
1242 MachineOperand RegOp = Cond[1];
1243 RegOp.setImplicit(false);
1244 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1245 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1246 .add(RegOp);
1247 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1248 .addImm(0)
1249 .addReg(TrueReg)
1250 .addImm(0)
1251 .addReg(FalseReg)
1252 .addReg(SReg);
1253 break;
1254 }
1255 case SIInstrInfo::EXECNZ: {
1256 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1257 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1258 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1259 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1260 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1261 .addImm(0)
1262 .addReg(FalseReg)
1263 .addImm(0)
1264 .addReg(TrueReg)
1265 .addReg(SReg);
1266 break;
1267 }
1268 case SIInstrInfo::EXECZ: {
1269 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1270 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1271 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1272 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1273 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1274 .addImm(0)
1275 .addReg(FalseReg)
1276 .addImm(0)
1277 .addReg(TrueReg)
1278 .addReg(SReg);
1279 llvm_unreachable("Unhandled branch predicate EXECZ");
1280 break;
1281 }
1282 default:
1283 llvm_unreachable("invalid branch predicate");
1284 }
1285 } else {
1286 llvm_unreachable("Can only handle Cond size 1 or 2");
1287 }
1288}
1289
1290Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1291                               MachineBasicBlock::iterator I,
1292                               const DebugLoc &DL,
1293 Register SrcReg, int Value) const {
1294 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1295 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1296 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1297 .addImm(Value)
1298 .addReg(SrcReg);
1299
1300 return Reg;
1301}
1302
1305 const DebugLoc &DL,
1306 Register SrcReg, int Value) const {
1307 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1308 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1309 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1310 .addImm(Value)
1311 .addReg(SrcReg);
1312
1313 return Reg;
1314}
1315
1316bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1317                                          const Register Reg,
1318 int64_t &ImmVal) const {
1319 switch (MI.getOpcode()) {
1320 case AMDGPU::V_MOV_B32_e32:
1321 case AMDGPU::S_MOV_B32:
1322 case AMDGPU::S_MOVK_I32:
1323 case AMDGPU::S_MOV_B64:
1324 case AMDGPU::V_MOV_B64_e32:
1325 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1326 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1327 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1328 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1329 case AMDGPU::V_MOV_B64_PSEUDO:
1330 case AMDGPU::V_MOV_B16_t16_e32: {
1331 const MachineOperand &Src0 = MI.getOperand(1);
1332 if (Src0.isImm()) {
1333 ImmVal = Src0.getImm();
1334 return MI.getOperand(0).getReg() == Reg;
1335 }
1336
1337 return false;
1338 }
1339 case AMDGPU::V_MOV_B16_t16_e64: {
1340 const MachineOperand &Src0 = MI.getOperand(2);
1341 if (Src0.isImm() && !MI.getOperand(1).getImm()) {
1342 ImmVal = Src0.getImm();
1343 return MI.getOperand(0).getReg() == Reg;
1344 }
1345
1346 return false;
1347 }
1348 case AMDGPU::S_BREV_B32:
1349 case AMDGPU::V_BFREV_B32_e32:
1350 case AMDGPU::V_BFREV_B32_e64: {
1351 const MachineOperand &Src0 = MI.getOperand(1);
1352 if (Src0.isImm()) {
1353 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1354 return MI.getOperand(0).getReg() == Reg;
1355 }
1356
1357 return false;
1358 }
1359 case AMDGPU::S_NOT_B32:
1360 case AMDGPU::V_NOT_B32_e32:
1361 case AMDGPU::V_NOT_B32_e64: {
1362 const MachineOperand &Src0 = MI.getOperand(1);
1363 if (Src0.isImm()) {
1364 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1365 return MI.getOperand(0).getReg() == Reg;
1366 }
1367
1368 return false;
1369 }
1370 default:
1371 return false;
1372 }
1373}
1374
1375unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1376
1377 if (RI.isAGPRClass(DstRC))
1378 return AMDGPU::COPY;
1379 if (RI.getRegSizeInBits(*DstRC) == 16) {
1380 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1381 // before RA.
1382 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1383 }
1384 if (RI.getRegSizeInBits(*DstRC) == 32)
1385 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1386 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1387 return AMDGPU::S_MOV_B64;
1388 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1389 return AMDGPU::V_MOV_B64_PSEUDO;
1390 return AMDGPU::COPY;
1391}
1392
1393const MCInstrDesc &
1394SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1395                                     bool IsIndirectSrc) const {
1396 if (IsIndirectSrc) {
1397 if (VecSize <= 32) // 4 bytes
1398 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1399 if (VecSize <= 64) // 8 bytes
1400 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1401 if (VecSize <= 96) // 12 bytes
1402 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1403 if (VecSize <= 128) // 16 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1405 if (VecSize <= 160) // 20 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1407 if (VecSize <= 192) // 24 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
1409 if (VecSize <= 224) // 28 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
1411 if (VecSize <= 256) // 32 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1413 if (VecSize <= 288) // 36 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1415 if (VecSize <= 320) // 40 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1417 if (VecSize <= 352) // 44 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1419 if (VecSize <= 384) // 48 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1421 if (VecSize <= 512) // 64 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1423 if (VecSize <= 1024) // 128 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1425
1426 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1427 }
1428
1429 if (VecSize <= 32) // 4 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1431 if (VecSize <= 64) // 8 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1433 if (VecSize <= 96) // 12 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1435 if (VecSize <= 128) // 16 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1437 if (VecSize <= 160) // 20 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1439 if (VecSize <= 192) // 24 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
1441 if (VecSize <= 224) // 28 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
1443 if (VecSize <= 256) // 32 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1445 if (VecSize <= 288) // 36 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1447 if (VecSize <= 320) // 40 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1449 if (VecSize <= 352) // 44 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1451 if (VecSize <= 384) // 48 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1453 if (VecSize <= 512) // 64 bytes
1454 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1455 if (VecSize <= 1024) // 128 bytes
1456 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1457
1458 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1459}
1460
1461static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1462 if (VecSize <= 32) // 4 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1464 if (VecSize <= 64) // 8 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1466 if (VecSize <= 96) // 12 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1468 if (VecSize <= 128) // 16 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1470 if (VecSize <= 160) // 20 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1472 if (VecSize <= 192) // 24 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1474 if (VecSize <= 224) // 28 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1476 if (VecSize <= 256) // 32 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1478 if (VecSize <= 288) // 36 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1480 if (VecSize <= 320) // 40 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1482 if (VecSize <= 352) // 44 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1484 if (VecSize <= 384) // 48 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1486 if (VecSize <= 512) // 64 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1488 if (VecSize <= 1024) // 128 bytes
1489 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1490
1491 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1492}
1493
1494static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1495 if (VecSize <= 32) // 4 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1497 if (VecSize <= 64) // 8 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1499 if (VecSize <= 96) // 12 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1501 if (VecSize <= 128) // 16 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1503 if (VecSize <= 160) // 20 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1505 if (VecSize <= 192) // 24 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1507 if (VecSize <= 224) // 28 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1509 if (VecSize <= 256) // 32 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1511 if (VecSize <= 288) // 36 bytes
1512 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1513 if (VecSize <= 320) // 40 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1515 if (VecSize <= 352) // 44 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1517 if (VecSize <= 384) // 48 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1519 if (VecSize <= 512) // 64 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1521 if (VecSize <= 1024) // 128 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1523
1524 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1525}
1526
1527static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1528 if (VecSize <= 64) // 8 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1530 if (VecSize <= 128) // 16 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1532 if (VecSize <= 256) // 32 bytes
1533 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1534 if (VecSize <= 512) // 64 bytes
1535 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1536 if (VecSize <= 1024) // 128 bytes
1537 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1538
1539 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1540}
1541
1542const MCInstrDesc &
1543SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1544 bool IsSGPR) const {
1545 if (IsSGPR) {
1546 switch (EltSize) {
1547 case 32:
1548 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1549 case 64:
1550 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1551 default:
1552 llvm_unreachable("invalid reg indexing elt size");
1553 }
1554 }
1555
1556 assert(EltSize == 32 && "invalid reg indexing elt size");
1557  return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1558}
1559
1560static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1561 switch (Size) {
1562 case 4:
1563 return AMDGPU::SI_SPILL_S32_SAVE;
1564 case 8:
1565 return AMDGPU::SI_SPILL_S64_SAVE;
1566 case 12:
1567 return AMDGPU::SI_SPILL_S96_SAVE;
1568 case 16:
1569 return AMDGPU::SI_SPILL_S128_SAVE;
1570 case 20:
1571 return AMDGPU::SI_SPILL_S160_SAVE;
1572 case 24:
1573 return AMDGPU::SI_SPILL_S192_SAVE;
1574 case 28:
1575 return AMDGPU::SI_SPILL_S224_SAVE;
1576 case 32:
1577 return AMDGPU::SI_SPILL_S256_SAVE;
1578 case 36:
1579 return AMDGPU::SI_SPILL_S288_SAVE;
1580 case 40:
1581 return AMDGPU::SI_SPILL_S320_SAVE;
1582 case 44:
1583 return AMDGPU::SI_SPILL_S352_SAVE;
1584 case 48:
1585 return AMDGPU::SI_SPILL_S384_SAVE;
1586 case 64:
1587 return AMDGPU::SI_SPILL_S512_SAVE;
1588 case 128:
1589 return AMDGPU::SI_SPILL_S1024_SAVE;
1590 default:
1591 llvm_unreachable("unknown register size");
1592 }
1593}
1594
1595static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1596 switch (Size) {
1597 case 2:
1598 return AMDGPU::SI_SPILL_V16_SAVE;
1599 case 4:
1600 return AMDGPU::SI_SPILL_V32_SAVE;
1601 case 8:
1602 return AMDGPU::SI_SPILL_V64_SAVE;
1603 case 12:
1604 return AMDGPU::SI_SPILL_V96_SAVE;
1605 case 16:
1606 return AMDGPU::SI_SPILL_V128_SAVE;
1607 case 20:
1608 return AMDGPU::SI_SPILL_V160_SAVE;
1609 case 24:
1610 return AMDGPU::SI_SPILL_V192_SAVE;
1611 case 28:
1612 return AMDGPU::SI_SPILL_V224_SAVE;
1613 case 32:
1614 return AMDGPU::SI_SPILL_V256_SAVE;
1615 case 36:
1616 return AMDGPU::SI_SPILL_V288_SAVE;
1617 case 40:
1618 return AMDGPU::SI_SPILL_V320_SAVE;
1619 case 44:
1620 return AMDGPU::SI_SPILL_V352_SAVE;
1621 case 48:
1622 return AMDGPU::SI_SPILL_V384_SAVE;
1623 case 64:
1624 return AMDGPU::SI_SPILL_V512_SAVE;
1625 case 128:
1626 return AMDGPU::SI_SPILL_V1024_SAVE;
1627 default:
1628 llvm_unreachable("unknown register size");
1629 }
1630}
1631
1632static unsigned getAVSpillSaveOpcode(unsigned Size) {
1633 switch (Size) {
1634 case 4:
1635 return AMDGPU::SI_SPILL_AV32_SAVE;
1636 case 8:
1637 return AMDGPU::SI_SPILL_AV64_SAVE;
1638 case 12:
1639 return AMDGPU::SI_SPILL_AV96_SAVE;
1640 case 16:
1641 return AMDGPU::SI_SPILL_AV128_SAVE;
1642 case 20:
1643 return AMDGPU::SI_SPILL_AV160_SAVE;
1644 case 24:
1645 return AMDGPU::SI_SPILL_AV192_SAVE;
1646 case 28:
1647 return AMDGPU::SI_SPILL_AV224_SAVE;
1648 case 32:
1649 return AMDGPU::SI_SPILL_AV256_SAVE;
1650 case 36:
1651 return AMDGPU::SI_SPILL_AV288_SAVE;
1652 case 40:
1653 return AMDGPU::SI_SPILL_AV320_SAVE;
1654 case 44:
1655 return AMDGPU::SI_SPILL_AV352_SAVE;
1656 case 48:
1657 return AMDGPU::SI_SPILL_AV384_SAVE;
1658 case 64:
1659 return AMDGPU::SI_SPILL_AV512_SAVE;
1660 case 128:
1661 return AMDGPU::SI_SPILL_AV1024_SAVE;
1662 default:
1663 llvm_unreachable("unknown register size");
1664 }
1665}
1666
1667static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1668 bool IsVectorSuperClass) {
1669 // Currently, there is only 32-bit WWM register spills needed.
1670 if (Size != 4)
1671 llvm_unreachable("unknown wwm register spill size");
1672
1673 if (IsVectorSuperClass)
1674 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1675
1676 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1677}
1678
1679unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1680    Register Reg, const TargetRegisterClass *RC, unsigned Size,
1681 const SIMachineFunctionInfo &MFI) const {
1682 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1683
1684  // Choose the right opcode if spilling a WWM register.
1685  if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1686    return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1687
1688 // TODO: Check if AGPRs are available
1689 if (ST.hasMAIInsts())
1690 return getAVSpillSaveOpcode(Size);
1691
1692  return getVGPRSpillSaveOpcode(Size);
1693}
1694
1695void SIInstrInfo::storeRegToStackSlot(
1696    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1697    bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1698 MachineInstr::MIFlag Flags) const {
1699  MachineFunction *MF = MBB.getParent();
1700  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1701  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1702 const DebugLoc &DL = MBB.findDebugLoc(MI);
1703
1704 MachinePointerInfo PtrInfo
1705    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1706  MachineMemOperand *MMO = MF->getMachineMemOperand(
1707      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1708 FrameInfo.getObjectAlign(FrameIndex));
1709 unsigned SpillSize = RI.getSpillSize(*RC);
1710
1711 MachineRegisterInfo &MRI = MF->getRegInfo();
1712 if (RI.isSGPRClass(RC)) {
1713 MFI->setHasSpilledSGPRs();
1714 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1715 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1716 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1717
1718 // We are only allowed to create one new instruction when spilling
1719 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1720 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1721
1722 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1723 // to make sure we are using the correct register class.
1724 if (SrcReg.isVirtual() && SpillSize == 4) {
1725 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1726 }
1727
1728 BuildMI(MBB, MI, DL, OpDesc)
1729 .addReg(SrcReg, getKillRegState(isKill)) // data
1730 .addFrameIndex(FrameIndex) // addr
1731 .addMemOperand(MMO)
1732 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1733
1734 if (RI.spillSGPRToVGPR())
1735 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1736 return;
1737 }
1738
1739 unsigned Opcode =
1740 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1741 MFI->setHasSpilledVGPRs();
1742
1743 BuildMI(MBB, MI, DL, get(Opcode))
1744 .addReg(SrcReg, getKillRegState(isKill)) // data
1745 .addFrameIndex(FrameIndex) // addr
1746 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1747 .addImm(0) // offset
1748 .addMemOperand(MMO);
1749}
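// Illustrative use (hypothetical caller; the names FI and InsertPt are
// examples only): a pass that must preserve an SGPR pair across a region
// could create a stack object and emit, with TII pointing at this
// SIInstrInfo:
//   TII->storeRegToStackSlot(MBB, InsertPt, AMDGPU::SGPR30_SGPR31,
//                            /*isKill=*/true, FI, &AMDGPU::SReg_64RegClass,
//                            /*VReg=*/Register(), MachineInstr::NoFlags);
// For an 8-byte SGPR class this goes down the SI_SPILL_S64_SAVE path above.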
1750
1751static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1752 switch (Size) {
1753 case 4:
1754 return AMDGPU::SI_SPILL_S32_RESTORE;
1755 case 8:
1756 return AMDGPU::SI_SPILL_S64_RESTORE;
1757 case 12:
1758 return AMDGPU::SI_SPILL_S96_RESTORE;
1759 case 16:
1760 return AMDGPU::SI_SPILL_S128_RESTORE;
1761 case 20:
1762 return AMDGPU::SI_SPILL_S160_RESTORE;
1763 case 24:
1764 return AMDGPU::SI_SPILL_S192_RESTORE;
1765 case 28:
1766 return AMDGPU::SI_SPILL_S224_RESTORE;
1767 case 32:
1768 return AMDGPU::SI_SPILL_S256_RESTORE;
1769 case 36:
1770 return AMDGPU::SI_SPILL_S288_RESTORE;
1771 case 40:
1772 return AMDGPU::SI_SPILL_S320_RESTORE;
1773 case 44:
1774 return AMDGPU::SI_SPILL_S352_RESTORE;
1775 case 48:
1776 return AMDGPU::SI_SPILL_S384_RESTORE;
1777 case 64:
1778 return AMDGPU::SI_SPILL_S512_RESTORE;
1779 case 128:
1780 return AMDGPU::SI_SPILL_S1024_RESTORE;
1781 default:
1782 llvm_unreachable("unknown register size");
1783 }
1784}
1785
1786static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1787 switch (Size) {
1788 case 2:
1789 return AMDGPU::SI_SPILL_V16_RESTORE;
1790 case 4:
1791 return AMDGPU::SI_SPILL_V32_RESTORE;
1792 case 8:
1793 return AMDGPU::SI_SPILL_V64_RESTORE;
1794 case 12:
1795 return AMDGPU::SI_SPILL_V96_RESTORE;
1796 case 16:
1797 return AMDGPU::SI_SPILL_V128_RESTORE;
1798 case 20:
1799 return AMDGPU::SI_SPILL_V160_RESTORE;
1800 case 24:
1801 return AMDGPU::SI_SPILL_V192_RESTORE;
1802 case 28:
1803 return AMDGPU::SI_SPILL_V224_RESTORE;
1804 case 32:
1805 return AMDGPU::SI_SPILL_V256_RESTORE;
1806 case 36:
1807 return AMDGPU::SI_SPILL_V288_RESTORE;
1808 case 40:
1809 return AMDGPU::SI_SPILL_V320_RESTORE;
1810 case 44:
1811 return AMDGPU::SI_SPILL_V352_RESTORE;
1812 case 48:
1813 return AMDGPU::SI_SPILL_V384_RESTORE;
1814 case 64:
1815 return AMDGPU::SI_SPILL_V512_RESTORE;
1816 case 128:
1817 return AMDGPU::SI_SPILL_V1024_RESTORE;
1818 default:
1819 llvm_unreachable("unknown register size");
1820 }
1821}
1822
1823static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1824 switch (Size) {
1825 case 4:
1826 return AMDGPU::SI_SPILL_AV32_RESTORE;
1827 case 8:
1828 return AMDGPU::SI_SPILL_AV64_RESTORE;
1829 case 12:
1830 return AMDGPU::SI_SPILL_AV96_RESTORE;
1831 case 16:
1832 return AMDGPU::SI_SPILL_AV128_RESTORE;
1833 case 20:
1834 return AMDGPU::SI_SPILL_AV160_RESTORE;
1835 case 24:
1836 return AMDGPU::SI_SPILL_AV192_RESTORE;
1837 case 28:
1838 return AMDGPU::SI_SPILL_AV224_RESTORE;
1839 case 32:
1840 return AMDGPU::SI_SPILL_AV256_RESTORE;
1841 case 36:
1842 return AMDGPU::SI_SPILL_AV288_RESTORE;
1843 case 40:
1844 return AMDGPU::SI_SPILL_AV320_RESTORE;
1845 case 44:
1846 return AMDGPU::SI_SPILL_AV352_RESTORE;
1847 case 48:
1848 return AMDGPU::SI_SPILL_AV384_RESTORE;
1849 case 64:
1850 return AMDGPU::SI_SPILL_AV512_RESTORE;
1851 case 128:
1852 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1853 default:
1854 llvm_unreachable("unknown register size");
1855 }
1856}
1857
1858static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1859 bool IsVectorSuperClass) {
1860 // Currently, only 32-bit WWM register spills are needed.
1861 if (Size != 4)
1862 llvm_unreachable("unknown wwm register spill size");
1863
1864 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1865 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1866
1867 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1868}
1869
1870unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1871 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1872 const SIMachineFunctionInfo &MFI) const {
1873 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1874
1875 // Choose the right opcode if restoring a WWM register.
1876 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1877 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1878
1879 // TODO: Check if AGPRs are available
1880 if (ST.hasMAIInsts())
1881 return getAVSpillRestoreOpcode(Size);
1882
1883 assert(!RI.isAGPRClass(RC));
1884 return getVGPRSpillRestoreOpcode(Size);
1885}
1886
1887void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1888 MachineBasicBlock::iterator MI,
1889 Register DestReg, int FrameIndex,
1890 const TargetRegisterClass *RC,
1891 Register VReg, unsigned SubReg,
1892 MachineInstr::MIFlag Flags) const {
1893 MachineFunction *MF = MBB.getParent();
1894 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1895 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1896 const DebugLoc &DL = MBB.findDebugLoc(MI);
1897 unsigned SpillSize = RI.getSpillSize(*RC);
1898
1899 MachinePointerInfo PtrInfo
1900 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1901
1902 MachineMemOperand *MMO = MF->getMachineMemOperand(
1903 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1904 FrameInfo.getObjectAlign(FrameIndex));
1905
1906 if (RI.isSGPRClass(RC)) {
1907 MFI->setHasSpilledSGPRs();
1908 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1909 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1910 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1911
1912 // FIXME: Maybe this should not include a memoperand because it will be
1913 // lowered to non-memory instructions.
1914 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1915 if (DestReg.isVirtual() && SpillSize == 4) {
1916 MachineRegisterInfo &MRI = MF->getRegInfo();
1917 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1918 }
1919
1920 if (RI.spillSGPRToVGPR())
1921 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1922 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1923 .addFrameIndex(FrameIndex) // addr
1924 .addMemOperand(MMO)
1925 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1926
1927 return;
1928 }
1929
1930 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1931 SpillSize, *MFI);
1932 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1933 .addFrameIndex(FrameIndex) // vaddr
1934 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1935 .addImm(0) // offset
1936 .addMemOperand(MMO);
1937}
1938
1943
1944void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1945 MachineBasicBlock::iterator MI,
1946 unsigned Quantity) const {
1947 DebugLoc DL = MBB.findDebugLoc(MI);
1948 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1949 while (Quantity > 0) {
1950 unsigned Arg = std::min(Quantity, MaxSNopCount);
1951 Quantity -= Arg;
1952 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1953 }
1954}
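// Worked example (illustrative values): if ST.getSNopBits() were 3, then
// MaxSNopCount is 8, and insertNoops(MBB, MI, 10) emits "s_nop 7" followed
// by "s_nop 1", covering 8 + 2 = 10 wait states since s_nop N waits N + 1.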
1955
1956void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1957 auto *MF = MBB.getParent();
1958 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1959
1960 assert(Info->isEntryFunction());
1961
1962 if (MBB.succ_empty()) {
1963 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1964 if (HasNoTerminator) {
1965 if (Info->returnsVoid()) {
1966 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1967 } else {
1968 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1969 }
1970 }
1971 }
1972}
1973
1974MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
1975 MachineBasicBlock &MBB,
1976 MachineInstr &MI,
1977 const DebugLoc &DL) const {
1978 MachineFunction *MF = MBB.getParent();
1979 constexpr unsigned DoorbellIDMask = 0x3ff;
1980 constexpr unsigned ECQueueWaveAbort = 0x400;
1981
1982 MachineBasicBlock *TrapBB = &MBB;
1983 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1984
1985 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1986 MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1987 TrapBB = MF->CreateMachineBasicBlock();
1988 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1989 MF->push_back(TrapBB);
1990 MBB.addSuccessor(TrapBB);
1991 }
1992 // Start with a `s_trap 2`; if we're in PRIV=1 and we need the workaround,
1993 // this will be a nop.
1994 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1995 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1996 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1997 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
1998 DoorbellReg)
1999 .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2000 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2001 .addUse(AMDGPU::M0);
2002 Register DoorbellRegMasked =
2003 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2004 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2005 .addUse(DoorbellReg)
2006 .addImm(DoorbellIDMask);
2007 Register SetWaveAbortBit =
2008 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2009 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2010 .addUse(DoorbellRegMasked)
2011 .addImm(ECQueueWaveAbort);
2012 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2013 .addUse(SetWaveAbortBit);
2014 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2015 .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
2016 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2017 .addUse(AMDGPU::TTMP2);
2018 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2019 TrapBB->addSuccessor(HaltLoopBB);
2020
2021 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2022 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2023 .addMBB(HaltLoopBB);
2024 MF->push_back(HaltLoopBB);
2025 HaltLoopBB->addSuccessor(HaltLoopBB);
2026
2027 return MBB.getNextNode();
2028}
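// Shape of the emitted code (sketch): the original block conditionally
// branches to TrapBB while lanes are still active; TrapBB issues s_trap,
// fetches the doorbell ID with s_sendmsg_rtn, masks it with DoorbellIDMask,
// ORs in ECQueueWaveAbort, signals the host via s_sendmsg, and then falls
// into HaltLoopBB, which spins on s_sethalt 5 forever.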
2029
2030unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2031 switch (MI.getOpcode()) {
2032 default:
2033 if (MI.isMetaInstruction())
2034 return 0;
2035 return 1; // FIXME: Do wait states equal cycles?
2036
2037 case AMDGPU::S_NOP:
2038 return MI.getOperand(0).getImm() + 1;
2039 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2040 // hazard, even if one exists, won't really be visible. Should we handle it?
2041 }
2042}
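// Example: "s_nop 3" reports 4 wait states, meta instructions report 0, and
// every other instruction is currently counted as a single wait state.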
2043
2044bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2045 MachineBasicBlock &MBB = *MI.getParent();
2046 DebugLoc DL = MBB.findDebugLoc(MI);
2048 switch (MI.getOpcode()) {
2049 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2050 case AMDGPU::S_MOV_B64_term:
2051 // This is only a terminator to get the correct spill code placement during
2052 // register allocation.
2053 MI.setDesc(get(AMDGPU::S_MOV_B64));
2054 break;
2055
2056 case AMDGPU::S_MOV_B32_term:
2057 // This is only a terminator to get the correct spill code placement during
2058 // register allocation.
2059 MI.setDesc(get(AMDGPU::S_MOV_B32));
2060 break;
2061
2062 case AMDGPU::S_XOR_B64_term:
2063 // This is only a terminator to get the correct spill code placement during
2064 // register allocation.
2065 MI.setDesc(get(AMDGPU::S_XOR_B64));
2066 break;
2067
2068 case AMDGPU::S_XOR_B32_term:
2069 // This is only a terminator to get the correct spill code placement during
2070 // register allocation.
2071 MI.setDesc(get(AMDGPU::S_XOR_B32));
2072 break;
2073 case AMDGPU::S_OR_B64_term:
2074 // This is only a terminator to get the correct spill code placement during
2075 // register allocation.
2076 MI.setDesc(get(AMDGPU::S_OR_B64));
2077 break;
2078 case AMDGPU::S_OR_B32_term:
2079 // This is only a terminator to get the correct spill code placement during
2080 // register allocation.
2081 MI.setDesc(get(AMDGPU::S_OR_B32));
2082 break;
2083
2084 case AMDGPU::S_ANDN2_B64_term:
2085 // This is only a terminator to get the correct spill code placement during
2086 // register allocation.
2087 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2088 break;
2089
2090 case AMDGPU::S_ANDN2_B32_term:
2091 // This is only a terminator to get the correct spill code placement during
2092 // register allocation.
2093 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2094 break;
2095
2096 case AMDGPU::S_AND_B64_term:
2097 // This is only a terminator to get the correct spill code placement during
2098 // register allocation.
2099 MI.setDesc(get(AMDGPU::S_AND_B64));
2100 break;
2101
2102 case AMDGPU::S_AND_B32_term:
2103 // This is only a terminator to get the correct spill code placement during
2104 // register allocation.
2105 MI.setDesc(get(AMDGPU::S_AND_B32));
2106 break;
2107
2108 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2109 // This is only a terminator to get the correct spill code placement during
2110 // register allocation.
2111 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2112 break;
2113
2114 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2115 // This is only a terminator to get the correct spill code placement during
2116 // register allocation.
2117 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2118 break;
2119
2120 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2121 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2122 break;
2123
2124 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2125 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2126 break;
2127 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2128 Register Dst = MI.getOperand(0).getReg();
2129 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2130 MI.setDesc(
2131 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2132 break;
2133 }
2134 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2135 Register Dst = MI.getOperand(0).getReg();
2136 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2137 int64_t Imm = MI.getOperand(1).getImm();
2138
2139 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2140 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2141 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2142 .addImm(SignExtend64<32>(Imm))
2143 .addReg(Dst, RegState::Implicit | RegState::Define);
2144 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2145 .addImm(SignExtend64<32>(Imm >> 32))
2146 .addReg(Dst, RegState::Implicit | RegState::Define);
2147 MI.eraseFromParent();
2148 break;
2149 }
2150
2151 [[fallthrough]];
2152 }
2153 case AMDGPU::V_MOV_B64_PSEUDO: {
2154 Register Dst = MI.getOperand(0).getReg();
2155 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2156 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2157
2158 const MachineOperand &SrcOp = MI.getOperand(1);
2159 // FIXME: Will this work for 64-bit floating point immediates?
2160 assert(!SrcOp.isFPImm());
2161 if (ST.hasMovB64()) {
2162 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2163 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2164 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2165 break;
2166 }
2167 if (SrcOp.isImm()) {
2168 APInt Imm(64, SrcOp.getImm());
2169 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2170 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2171 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2172 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2173 .addImm(SISrcMods::OP_SEL_1)
2174 .addImm(Lo.getSExtValue())
2175 .addImm(SISrcMods::OP_SEL_1)
2176 .addImm(Lo.getSExtValue())
2177 .addImm(0) // op_sel_lo
2178 .addImm(0) // op_sel_hi
2179 .addImm(0) // neg_lo
2180 .addImm(0) // neg_hi
2181 .addImm(0); // clamp
2182 } else {
2183 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2184 .addImm(Lo.getSExtValue())
2185 .addReg(Dst, RegState::Implicit | RegState::Define);
2186 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2187 .addImm(Hi.getSExtValue())
2188 .addReg(Dst, RegState::Implicit | RegState::Define);
2189 }
2190 } else {
2191 assert(SrcOp.isReg());
2192 if (ST.hasPkMovB32() &&
2193 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2194 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2195 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2196 .addReg(SrcOp.getReg())
2197 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2198 .addReg(SrcOp.getReg())
2199 .addImm(0) // op_sel_lo
2200 .addImm(0) // op_sel_hi
2201 .addImm(0) // neg_lo
2202 .addImm(0) // neg_hi
2203 .addImm(0); // clamp
2204 } else {
2205 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2206 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2207 .addReg(Dst, RegState::Implicit | RegState::Define);
2208 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2209 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2210 .addReg(Dst, RegState::Implicit | RegState::Define);
2211 }
2212 }
2213 MI.eraseFromParent();
2214 break;
2215 }
2216 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2217 expandMovDPP64(MI);
2218 break;
2219 }
2220 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2221 const MachineOperand &SrcOp = MI.getOperand(1);
2222 assert(!SrcOp.isFPImm());
2223
2224 if (ST.has64BitLiterals()) {
2225 MI.setDesc(get(AMDGPU::S_MOV_B64));
2226 break;
2227 }
2228
2229 APInt Imm(64, SrcOp.getImm());
2230 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2231 MI.setDesc(get(AMDGPU::S_MOV_B64));
2232 break;
2233 }
2234
2235 Register Dst = MI.getOperand(0).getReg();
2236 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2237 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2238
2239 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2240 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2241 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2242 .addImm(Lo.getSExtValue())
2243 .addReg(Dst, RegState::Implicit | RegState::Define);
2244 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2245 .addImm(Hi.getSExtValue())
2246 .addReg(Dst, RegState::Implicit | RegState::Define);
2247 MI.eraseFromParent();
2248 break;
2249 }
2250 case AMDGPU::V_SET_INACTIVE_B32: {
2251 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2252 Register DstReg = MI.getOperand(0).getReg();
2253 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2254 .add(MI.getOperand(3))
2255 .add(MI.getOperand(4))
2256 .add(MI.getOperand(1))
2257 .add(MI.getOperand(2))
2258 .add(MI.getOperand(5));
2259 MI.eraseFromParent();
2260 break;
2261 }
2262 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2263 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2264 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2265 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2266 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2267 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2268 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2269 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2270 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2271 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2272 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2273 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2274 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2275 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2276 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2277 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2278 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2279 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2280 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2281 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2282 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2283 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2284 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2285 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2286 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2287 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2288 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2289 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2290 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2291 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2292 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2293 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2294 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2295 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2296
2297 unsigned Opc;
2298 if (RI.hasVGPRs(EltRC)) {
2299 Opc = AMDGPU::V_MOVRELD_B32_e32;
2300 } else {
2301 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2302 : AMDGPU::S_MOVRELD_B32;
2303 }
2304
2305 const MCInstrDesc &OpDesc = get(Opc);
2306 Register VecReg = MI.getOperand(0).getReg();
2307 bool IsUndef = MI.getOperand(1).isUndef();
2308 unsigned SubReg = MI.getOperand(3).getImm();
2309 assert(VecReg == MI.getOperand(1).getReg());
2310
2312 BuildMI(MBB, MI, DL, OpDesc)
2313 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2314 .add(MI.getOperand(2))
2316 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2317
2318 const int ImpDefIdx =
2319 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2320 const int ImpUseIdx = ImpDefIdx + 1;
2321 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2322 MI.eraseFromParent();
2323 break;
2324 }
2325 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2326 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2327 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2328 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2329 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2330 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2331 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2332 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2333 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2334 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2335 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2336 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2337 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2338 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2339 assert(ST.useVGPRIndexMode());
2340 Register VecReg = MI.getOperand(0).getReg();
2341 bool IsUndef = MI.getOperand(1).isUndef();
2342 MachineOperand &Idx = MI.getOperand(3);
2343 Register SubReg = MI.getOperand(4).getImm();
2344
2345 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2346 .add(Idx)
2348 SetOn->getOperand(3).setIsUndef();
2349
2350 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2352 BuildMI(MBB, MI, DL, OpDesc)
2353 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2354 .add(MI.getOperand(2))
2356 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2357
2358 const int ImpDefIdx =
2359 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2360 const int ImpUseIdx = ImpDefIdx + 1;
2361 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2362
2363 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2364
2365 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2366
2367 MI.eraseFromParent();
2368 break;
2369 }
2370 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2371 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2372 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2373 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2374 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2375 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2376 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2377 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2378 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2379 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2380 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2381 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2382 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2383 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2384 assert(ST.useVGPRIndexMode());
2385 Register Dst = MI.getOperand(0).getReg();
2386 Register VecReg = MI.getOperand(1).getReg();
2387 bool IsUndef = MI.getOperand(1).isUndef();
2388 Register Idx = MI.getOperand(2).getReg();
2389 Register SubReg = MI.getOperand(3).getImm();
2390
2391 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2392 .addReg(Idx)
2394 SetOn->getOperand(3).setIsUndef();
2395
2396 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2397 .addDef(Dst)
2398 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2399 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2400
2401 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2402
2403 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2404
2405 MI.eraseFromParent();
2406 break;
2407 }
2408 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2409 MachineFunction &MF = *MBB.getParent();
2410 Register Reg = MI.getOperand(0).getReg();
2411 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2412 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2413 MachineOperand OpLo = MI.getOperand(1);
2414 MachineOperand OpHi = MI.getOperand(2);
2415
2416 // Create a bundle so these instructions won't be re-ordered by the
2417 // post-RA scheduler.
2418 MIBundleBuilder Bundler(MBB, MI);
2419 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2420
2421 // What we want here is an offset from the value returned by s_getpc (which
2422 // is the address of the s_add_u32 instruction) to the global variable, but
2423 // since the encoding of $symbol starts 4 bytes after the start of the
2424 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2425 // small. This requires us to add 4 to the global variable offset in order
2426 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2427 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2428 // instruction.
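// Worked example: if s_getpc_b64 returns PC (the address of the following
// s_add_u32), the 32-bit literal for $symbol in s_add_u32 is encoded at
// PC + 4 and the one in s_addc_u32 at PC + 12, which is why 4 and 12 are
// folded into the low and high symbol offsets below (plus Adjust when the
// extra s_sext_i32_i16 is inserted).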
2429
2430 int64_t Adjust = 0;
2431 if (ST.hasGetPCZeroExtension()) {
2432 // Fix up hardware that does not sign-extend the 48-bit PC value by
2433 // inserting: s_sext_i32_i16 reghi, reghi
2434 Bundler.append(
2435 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2436 Adjust += 4;
2437 }
2438
2439 if (OpLo.isGlobal())
2440 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2441 Bundler.append(
2442 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2443
2444 if (OpHi.isGlobal())
2445 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2446 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2447 .addReg(RegHi)
2448 .add(OpHi));
2449
2450 finalizeBundle(MBB, Bundler.begin());
2451
2452 MI.eraseFromParent();
2453 break;
2454 }
2455 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2456 MachineFunction &MF = *MBB.getParent();
2457 Register Reg = MI.getOperand(0).getReg();
2458 MachineOperand Op = MI.getOperand(1);
2459
2460 // Create a bundle so these instructions won't be re-ordered by the
2461 // post-RA scheduler.
2462 MIBundleBuilder Bundler(MBB, MI);
2463 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2464 if (Op.isGlobal())
2465 Op.setOffset(Op.getOffset() + 4);
2466 Bundler.append(
2467 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2468
2469 finalizeBundle(MBB, Bundler.begin());
2470
2471 MI.eraseFromParent();
2472 break;
2473 }
2474 case AMDGPU::ENTER_STRICT_WWM: {
2475 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2476 // Whole Wave Mode is entered.
2477 MI.setDesc(get(LMC.OrSaveExecOpc));
2478 break;
2479 }
2480 case AMDGPU::ENTER_STRICT_WQM: {
2481 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2482 // STRICT_WQM is entered.
2483 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2484 .addReg(LMC.ExecReg);
2485 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2486
2487 MI.eraseFromParent();
2488 break;
2489 }
2490 case AMDGPU::EXIT_STRICT_WWM:
2491 case AMDGPU::EXIT_STRICT_WQM: {
2492 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2493 // WWM/STRICT_WQM is exited.
2494 MI.setDesc(get(LMC.MovOpc));
2495 break;
2496 }
2497 case AMDGPU::SI_RETURN: {
2498 const MachineFunction *MF = MBB.getParent();
2499 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2500 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2501 // Hiding the return address use with SI_RETURN may lead to extra kills in
2502 // the function and missing live-ins. We are fine in practice because callee
2503 // saved register handling ensures the register value is restored before
2504 // RET, but we need the undef flag here to appease the MachineVerifier
2505 // liveness checks.
2506 MachineInstrBuilder MIB =
2507 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2508 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2509
2510 MIB.copyImplicitOps(MI);
2511 MI.eraseFromParent();
2512 break;
2513 }
2514
2515 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2516 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2517 MI.setDesc(get(AMDGPU::S_MUL_U64));
2518 break;
2519
2520 case AMDGPU::S_GETPC_B64_pseudo:
2521 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2522 if (ST.hasGetPCZeroExtension()) {
2523 Register Dst = MI.getOperand(0).getReg();
2524 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2525 // Fix up hardware that does not sign-extend the 48-bit PC value by
2526 // inserting: s_sext_i32_i16 dsthi, dsthi
2527 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2528 DstHi)
2529 .addReg(DstHi);
2530 }
2531 break;
2532
2533 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2534 assert(ST.hasBF16PackedInsts());
2535 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2536 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2537 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2538 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2539 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2540 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2541 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2542 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2543 break;
2544 }
2545
2546 return true;
2547}
2548
2549void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2550 MachineBasicBlock::iterator I, Register DestReg,
2551 unsigned SubIdx,
2552 const MachineInstr &Orig) const {
2553
2554 // Try shrinking the instruction to rematerialize only the part needed for
2555 // the current context.
2556 // TODO: Handle more cases.
2557 unsigned Opcode = Orig.getOpcode();
2558 switch (Opcode) {
2559 case AMDGPU::S_LOAD_DWORDX16_IMM:
2560 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2561 if (SubIdx != 0)
2562 break;
2563
2564 if (I == MBB.end())
2565 break;
2566
2567 if (I->isBundled())
2568 break;
2569
2570 // Look for a single use of the register that is also a subreg.
2571 Register RegToFind = Orig.getOperand(0).getReg();
2572 MachineOperand *UseMO = nullptr;
2573 for (auto &CandMO : I->operands()) {
2574 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2575 continue;
2576 if (UseMO) {
2577 UseMO = nullptr;
2578 break;
2579 }
2580 UseMO = &CandMO;
2581 }
2582 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2583 break;
2584
2585 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2586 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2587
2588 MachineFunction *MF = MBB.getParent();
2589 MachineRegisterInfo &MRI = MF->getRegInfo();
2590 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2591
2592 unsigned NewOpcode = -1;
2593 if (SubregSize == 256)
2594 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2595 else if (SubregSize == 128)
2596 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2597 else
2598 break;
2599
2600 const MCInstrDesc &TID = get(NewOpcode);
2601 const TargetRegisterClass *NewRC =
2602 RI.getAllocatableClass(getRegClass(TID, 0));
2603 MRI.setRegClass(DestReg, NewRC);
2604
2605 UseMO->setReg(DestReg);
2606 UseMO->setSubReg(AMDGPU::NoSubRegister);
2607
2608 // Use a smaller load with the desired size, possibly with updated offset.
2609 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2610 MI->setDesc(TID);
2611 MI->getOperand(0).setReg(DestReg);
2612 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2613 if (Offset) {
2614 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2615 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2616 OffsetMO->setImm(FinalOffset);
2617 }
2618 SmallVector<MachineMemOperand *> NewMMOs;
2619 for (const MachineMemOperand *MemOp : Orig.memoperands())
2620 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2621 SubregSize / 8));
2622 MI->setMemRefs(*MF, NewMMOs);
2623
2624 MBB.insert(I, MI);
2625 return;
2626 }
2627
2628 default:
2629 break;
2630 }
2631
2632 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig);
2633}
2634
2635std::pair<MachineInstr*, MachineInstr*>
2636SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2637 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2638
2639 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2641 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2642 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2643 return std::pair(&MI, nullptr);
2644 }
2645
2646 MachineBasicBlock &MBB = *MI.getParent();
2647 DebugLoc DL = MBB.findDebugLoc(MI);
2648 MachineFunction *MF = MBB.getParent();
2649 MachineRegisterInfo &MRI = MF->getRegInfo();
2650 Register Dst = MI.getOperand(0).getReg();
2651 unsigned Part = 0;
2652 MachineInstr *Split[2];
2653
2654 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2655 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2656 if (Dst.isPhysical()) {
2657 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2658 } else {
2659 assert(MRI.isSSA());
2660 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2661 MovDPP.addDef(Tmp);
2662 }
2663
2664 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2665 const MachineOperand &SrcOp = MI.getOperand(I);
2666 assert(!SrcOp.isFPImm());
2667 if (SrcOp.isImm()) {
2668 APInt Imm(64, SrcOp.getImm());
2669 Imm.ashrInPlace(Part * 32);
2670 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2671 } else {
2672 assert(SrcOp.isReg());
2673 Register Src = SrcOp.getReg();
2674 if (Src.isPhysical())
2675 MovDPP.addReg(RI.getSubReg(Src, Sub));
2676 else
2677 MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub);
2678 }
2679 }
2680
2681 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2682 MovDPP.addImm(MO.getImm());
2683
2684 Split[Part] = MovDPP;
2685 ++Part;
2686 }
2687
2688 if (Dst.isVirtual())
2689 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2690 .addReg(Split[0]->getOperand(0).getReg())
2691 .addImm(AMDGPU::sub0)
2692 .addReg(Split[1]->getOperand(0).getReg())
2693 .addImm(AMDGPU::sub1);
2694
2695 MI.eraseFromParent();
2696 return std::pair(Split[0], Split[1]);
2697}
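// Summary of the split path above: when V_MOV_B64_dpp cannot be used
// directly, the 64-bit DPP move is rewritten as two V_MOV_B32_dpp
// instructions on sub0 and sub1 (an immediate source is split into its low
// and high 32 bits), and a REG_SEQUENCE recombines the halves when the
// destination is a virtual register.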
2698
2699std::optional<DestSourcePair>
2700SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2701 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2702 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2703
2704 return std::nullopt;
2705}
2706
2707bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0,
2708 AMDGPU::OpName Src0OpName,
2709 MachineOperand &Src1,
2710 AMDGPU::OpName Src1OpName) const {
2711 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2712 if (!Src0Mods)
2713 return false;
2714
2715 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2716 assert(Src1Mods &&
2717 "All commutable instructions have both src0 and src1 modifiers");
2718
2719 int Src0ModsVal = Src0Mods->getImm();
2720 int Src1ModsVal = Src1Mods->getImm();
2721
2722 Src1Mods->setImm(Src0ModsVal);
2723 Src0Mods->setImm(Src1ModsVal);
2724 return true;
2725}
2726
2727static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2728 MachineOperand &RegOp,
2729 MachineOperand &NonRegOp) {
2730 Register Reg = RegOp.getReg();
2731 unsigned SubReg = RegOp.getSubReg();
2732 bool IsKill = RegOp.isKill();
2733 bool IsDead = RegOp.isDead();
2734 bool IsUndef = RegOp.isUndef();
2735 bool IsDebug = RegOp.isDebug();
2736
2737 if (NonRegOp.isImm())
2738 RegOp.ChangeToImmediate(NonRegOp.getImm());
2739 else if (NonRegOp.isFI())
2740 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2741 else if (NonRegOp.isGlobal()) {
2742 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2743 NonRegOp.getTargetFlags());
2744 } else
2745 return nullptr;
2746
2747 // Make sure we don't reinterpret a subreg index in the target flags.
2748 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2749
2750 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2751 NonRegOp.setSubReg(SubReg);
2752
2753 return &MI;
2754}
2755
2756static MachineInstr *swapImmOperands(MachineInstr &MI,
2757 MachineOperand &NonRegOp1,
2758 MachineOperand &NonRegOp2) {
2759 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2760 int64_t NonRegVal = NonRegOp1.getImm();
2761
2762 NonRegOp1.setImm(NonRegOp2.getImm());
2763 NonRegOp2.setImm(NonRegVal);
2764 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2765 NonRegOp2.setTargetFlags(TargetFlags);
2766 return &MI;
2767}
2768
2769bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2770 unsigned OpIdx1) const {
2771 const MCInstrDesc &InstDesc = MI.getDesc();
2772 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2773 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2774
2775 unsigned Opc = MI.getOpcode();
2776 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2777
2778 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2779 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2780
2781 // Make sure the swap does not breach the constant bus or literal limits.
2782 // A swap may move a literal to a position other than src0, which is not
2783 // allowed pre-gfx10; however, most test cases need literals in src0 for VOP.
2784 // FIXME: After gfx9, a literal can be placed somewhere other than src0.
2785 if (isVALU(MI)) {
2786 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2787 !isInlineConstant(MO0, OpInfo1))
2788 return false;
2789 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2790 !isInlineConstant(MO1, OpInfo0))
2791 return false;
2792 }
2793
2794 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2795 if (OpInfo1.RegClass == -1)
2796 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2797 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2798 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2799 }
2800 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2801 if (OpInfo0.RegClass == -1)
2802 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2803 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2804 isLegalRegOperand(MI, OpIdx0, MO1);
2805 }
2806
2807 // No need to check 64-bit literals, since swapping does not bring new
2808 // 64-bit literals into the current instruction to fold to 32-bit.
2809
2810 return isImmOperandLegal(MI, OpIdx1, MO0);
2811}
2812
2813MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2814 unsigned Src0Idx,
2815 unsigned Src1Idx) const {
2816 assert(!NewMI && "this should never be used");
2817
2818 unsigned Opc = MI.getOpcode();
2819 int CommutedOpcode = commuteOpcode(Opc);
2820 if (CommutedOpcode == -1)
2821 return nullptr;
2822
2823 if (Src0Idx > Src1Idx)
2824 std::swap(Src0Idx, Src1Idx);
2825
2826 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2827 static_cast<int>(Src0Idx) &&
2828 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2829 static_cast<int>(Src1Idx) &&
2830 "inconsistency with findCommutedOpIndices");
2831
2832 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2833 return nullptr;
2834
2835 MachineInstr *CommutedMI = nullptr;
2836 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2837 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2838 if (Src0.isReg() && Src1.isReg()) {
2839 // Be sure to copy the source modifiers to the right place.
2840 CommutedMI =
2841 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2842 } else if (Src0.isReg() && !Src1.isReg()) {
2843 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2844 } else if (!Src0.isReg() && Src1.isReg()) {
2845 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2846 } else if (Src0.isImm() && Src1.isImm()) {
2847 CommutedMI = swapImmOperands(MI, Src0, Src1);
2848 } else {
2849 // FIXME: Found two non registers to commute. This does happen.
2850 return nullptr;
2851 }
2852
2853 if (CommutedMI) {
2854 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2855 Src1, AMDGPU::OpName::src1_modifiers);
2856
2857 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2858 AMDGPU::OpName::src1_sel);
2859
2860 CommutedMI->setDesc(get(CommutedOpcode));
2861 }
2862
2863 return CommutedMI;
2864}
2865
2866// This needs to be implemented because the source modifiers may be inserted
2867// between the true commutable operands, and the base
2868// TargetInstrInfo::commuteInstruction uses it.
2869bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2870 unsigned &SrcOpIdx0,
2871 unsigned &SrcOpIdx1) const {
2872 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2873}
2874
2875bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2876 unsigned &SrcOpIdx0,
2877 unsigned &SrcOpIdx1) const {
2878 if (!Desc.isCommutable())
2879 return false;
2880
2881 unsigned Opc = Desc.getOpcode();
2882 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2883 if (Src0Idx == -1)
2884 return false;
2885
2886 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2887 if (Src1Idx == -1)
2888 return false;
2889
2890 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2891}
2892
2893bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2894 int64_t BrOffset) const {
2895 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2896 // because its dest block is unanalyzable.
2897 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2898
2899 // Convert to dwords.
2900 BrOffset /= 4;
2901
2902 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2903 // from the next instruction.
2904 BrOffset -= 1;
2905
2906 return isIntN(BranchOffsetBits, BrOffset);
2907}
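// Worked example: with a 16-bit branch offset field the check above accepts
// BrOffset/4 - 1 in [-32768, 32767], i.e. roughly -131068 to +131072 bytes
// relative to the branch instruction.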
2908
2911 return MI.getOperand(0).getMBB();
2912}
2913
2915 for (const MachineInstr &MI : MBB->terminators()) {
2916 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2917 MI.getOpcode() == AMDGPU::SI_LOOP)
2918 return true;
2919 }
2920 return false;
2921}
2922
2923void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2924 MachineBasicBlock &DestBB,
2925 MachineBasicBlock &RestoreBB,
2926 const DebugLoc &DL, int64_t BrOffset,
2927 RegScavenger *RS) const {
2928 assert(MBB.empty() &&
2929 "new block should be inserted for expanding unconditional branch");
2930 assert(MBB.pred_size() == 1);
2931 assert(RestoreBB.empty() &&
2932 "restore block should be inserted for restoring clobbered registers");
2933
2934 MachineFunction *MF = MBB.getParent();
2935 MachineRegisterInfo &MRI = MF->getRegInfo();
2936 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2937 auto I = MBB.end();
2938 auto &MCCtx = MF->getContext();
2939
2940 if (ST.useAddPC64Inst()) {
2941 MCSymbol *Offset =
2942 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2943 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2945 MCSymbol *PostAddPCLabel =
2946 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2947 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2948 auto *OffsetExpr = MCBinaryExpr::createSub(
2949 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2950 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2951 Offset->setVariableValue(OffsetExpr);
2952 return;
2953 }
2954
2955 assert(RS && "RegScavenger required for long branching");
2956
2957 // FIXME: Virtual register workaround for RegScavenger not working with empty
2958 // blocks.
2959 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2960
2961 // Note: as this is used after hazard recognizer we need to apply some hazard
2962 // workarounds directly.
2963 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2964 ST.hasVALUReadSGPRHazard();
2965 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2966 if (FlushSGPRWrites)
2967 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2969 };
2970
2971 // We need to compute the offset relative to the instruction immediately after
2972 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2973 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2974 ApplyHazardWorkarounds();
2975
2976 MCSymbol *PostGetPCLabel =
2977 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2978 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2979
2980 MCSymbol *OffsetLo =
2981 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2982 MCSymbol *OffsetHi =
2983 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2984 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2985 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2986 .addReg(PCReg, 0, AMDGPU::sub0)
2987 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2988 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2989 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2990 .addReg(PCReg, 0, AMDGPU::sub1)
2991 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2992 ApplyHazardWorkarounds();
2993
2994 // Insert the indirect branch after the other terminator.
2995 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2996 .addReg(PCReg);
2997
2998 // If a spill is needed for the pc register pair, we need to insert a spill
2999 // restore block right before the destination block, and insert a short branch
3000 // into the old destination block's fallthrough predecessor.
3001 // e.g.:
3002 //
3003 // s_cbranch_scc0 skip_long_branch:
3004 //
3005 // long_branch_bb:
3006 // spill s[8:9]
3007 // s_getpc_b64 s[8:9]
3008 // s_add_u32 s8, s8, restore_bb
3009 // s_addc_u32 s9, s9, 0
3010 // s_setpc_b64 s[8:9]
3011 //
3012 // skip_long_branch:
3013 // foo;
3014 //
3015 // .....
3016 //
3017 // dest_bb_fallthrough_predecessor:
3018 // bar;
3019 // s_branch dest_bb
3020 //
3021 // restore_bb:
3022 // restore s[8:9]
3023 // fallthrough dest_bb
3024 ///
3025 // dest_bb:
3026 // buzz;
3027
3028 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3029 Register Scav;
3030
3031 // If we've previously reserved a register for long branches,
3032 // avoid running the scavenger and just use those registers.
3033 if (LongBranchReservedReg) {
3034 RS->enterBasicBlock(MBB);
3035 Scav = LongBranchReservedReg;
3036 } else {
3037 RS->enterBasicBlockEnd(MBB);
3038 Scav = RS->scavengeRegisterBackwards(
3039 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3040 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3041 }
3042 if (Scav) {
3043 RS->setRegUsed(Scav);
3044 MRI.replaceRegWith(PCReg, Scav);
3045 MRI.clearVirtRegs();
3046 } else {
3047 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3048 // SGPR spill.
3049 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3050 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3051 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3052 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3053 MRI.clearVirtRegs();
3054 }
3055
3056 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3057 // Now the distance can be defined.
3058 auto *Offset = MCBinaryExpr::createSub(
3059 MCSymbolRefExpr::create(DestLabel, MCCtx),
3060 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3061 // Add offset assignments.
3062 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3063 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3064 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3065 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3066}
3067
3068unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3069 switch (Cond) {
3070 case SIInstrInfo::SCC_TRUE:
3071 return AMDGPU::S_CBRANCH_SCC1;
3072 case SIInstrInfo::SCC_FALSE:
3073 return AMDGPU::S_CBRANCH_SCC0;
3074 case SIInstrInfo::VCCNZ:
3075 return AMDGPU::S_CBRANCH_VCCNZ;
3076 case SIInstrInfo::VCCZ:
3077 return AMDGPU::S_CBRANCH_VCCZ;
3078 case SIInstrInfo::EXECNZ:
3079 return AMDGPU::S_CBRANCH_EXECNZ;
3080 case SIInstrInfo::EXECZ:
3081 return AMDGPU::S_CBRANCH_EXECZ;
3082 default:
3083 llvm_unreachable("invalid branch predicate");
3084 }
3085}
3086
3087SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3088 switch (Opcode) {
3089 case AMDGPU::S_CBRANCH_SCC0:
3090 return SCC_FALSE;
3091 case AMDGPU::S_CBRANCH_SCC1:
3092 return SCC_TRUE;
3093 case AMDGPU::S_CBRANCH_VCCNZ:
3094 return VCCNZ;
3095 case AMDGPU::S_CBRANCH_VCCZ:
3096 return VCCZ;
3097 case AMDGPU::S_CBRANCH_EXECNZ:
3098 return EXECNZ;
3099 case AMDGPU::S_CBRANCH_EXECZ:
3100 return EXECZ;
3101 default:
3102 return INVALID_BR;
3103 }
3104}
3105
3106bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3107 MachineBasicBlock::iterator I,
3108 MachineBasicBlock *&TBB,
3109 MachineBasicBlock *&FBB,
3110 SmallVectorImpl<MachineOperand> &Cond,
3111 bool AllowModify) const {
3112 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3113 // Unconditional Branch
3114 TBB = I->getOperand(0).getMBB();
3115 return false;
3116 }
3117
3118 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3119 if (Pred == INVALID_BR)
3120 return true;
3121
3122 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3123 Cond.push_back(MachineOperand::CreateImm(Pred));
3124 Cond.push_back(I->getOperand(1)); // Save the branch register.
3125
3126 ++I;
3127
3128 if (I == MBB.end()) {
3129 // Conditional branch followed by fall-through.
3130 TBB = CondBB;
3131 return false;
3132 }
3133
3134 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3135 TBB = CondBB;
3136 FBB = I->getOperand(0).getMBB();
3137 return false;
3138 }
3139
3140 return true;
3141}
3142
3143bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3144 MachineBasicBlock *&FBB,
3145 SmallVectorImpl<MachineOperand> &Cond,
3146 bool AllowModify) const {
3147 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3148 auto E = MBB.end();
3149 if (I == E)
3150 return false;
3151
3152 // Skip over the instructions that are artificially terminators for special
3153 // exec management.
3154 while (I != E && !I->isBranch() && !I->isReturn()) {
3155 switch (I->getOpcode()) {
3156 case AMDGPU::S_MOV_B64_term:
3157 case AMDGPU::S_XOR_B64_term:
3158 case AMDGPU::S_OR_B64_term:
3159 case AMDGPU::S_ANDN2_B64_term:
3160 case AMDGPU::S_AND_B64_term:
3161 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3162 case AMDGPU::S_MOV_B32_term:
3163 case AMDGPU::S_XOR_B32_term:
3164 case AMDGPU::S_OR_B32_term:
3165 case AMDGPU::S_ANDN2_B32_term:
3166 case AMDGPU::S_AND_B32_term:
3167 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3168 break;
3169 case AMDGPU::SI_IF:
3170 case AMDGPU::SI_ELSE:
3171 case AMDGPU::SI_KILL_I1_TERMINATOR:
3172 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3173 // FIXME: It's messy that these need to be considered here at all.
3174 return true;
3175 default:
3176 llvm_unreachable("unexpected non-branch terminator inst");
3177 }
3178
3179 ++I;
3180 }
3181
3182 if (I == E)
3183 return false;
3184
3185 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3186}
3187
3188unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3189 int *BytesRemoved) const {
3190 unsigned Count = 0;
3191 unsigned RemovedSize = 0;
3192 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3193 // Skip over artificial terminators when removing instructions.
3194 if (MI.isBranch() || MI.isReturn()) {
3195 RemovedSize += getInstSizeInBytes(MI);
3196 MI.eraseFromParent();
3197 ++Count;
3198 }
3199 }
3200
3201 if (BytesRemoved)
3202 *BytesRemoved = RemovedSize;
3203
3204 return Count;
3205}
3206
3207// Copy the flags onto the implicit condition register operand.
3208static void preserveCondRegFlags(MachineOperand &CondReg,
3209 const MachineOperand &OrigCond) {
3210 CondReg.setIsUndef(OrigCond.isUndef());
3211 CondReg.setIsKill(OrigCond.isKill());
3212}
3213
3214unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3215 MachineBasicBlock *TBB,
3216 MachineBasicBlock *FBB,
3217 ArrayRef<MachineOperand> Cond,
3218 const DebugLoc &DL,
3219 int *BytesAdded) const {
3220 if (!FBB && Cond.empty()) {
3221 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3222 .addMBB(TBB);
3223 if (BytesAdded)
3224 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3225 return 1;
3226 }
3227
3228 assert(TBB && Cond[0].isImm());
3229
3230 unsigned Opcode
3231 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3232
3233 if (!FBB) {
3234 MachineInstr *CondBr =
3235 BuildMI(&MBB, DL, get(Opcode))
3236 .addMBB(TBB);
3237
3238 // Copy the flags onto the implicit condition register operand.
3239 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3240 fixImplicitOperands(*CondBr);
3241
3242 if (BytesAdded)
3243 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3244 return 1;
3245 }
3246
3247 assert(TBB && FBB);
3248
3249 MachineInstr *CondBr =
3250 BuildMI(&MBB, DL, get(Opcode))
3251 .addMBB(TBB);
3252 fixImplicitOperands(*CondBr);
3253 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3254 .addMBB(FBB);
3255
3256 MachineOperand &CondReg = CondBr->getOperand(1);
3257 CondReg.setIsUndef(Cond[1].isUndef());
3258 CondReg.setIsKill(Cond[1].isKill());
3259
3260 if (BytesAdded)
3261 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3262
3263 return 2;
3264}
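// Example (illustrative): insertBranch(MBB, TBB, FBB, {Imm(SCC_TRUE), <scc
// operand>}) emits
//   s_cbranch_scc1 <TBB>
//   s_branch <FBB>
// returns 2, and reports 8 bytes added (16 when the offset-0x3f hardware bug
// workaround applies).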
3265
3266bool SIInstrInfo::reverseBranchCondition(
3267 SmallVectorImpl<MachineOperand> &Cond) const {
3268 if (Cond.size() != 2) {
3269 return true;
3270 }
3271
3272 if (Cond[0].isImm()) {
3273 Cond[0].setImm(-Cond[0].getImm());
3274 return false;
3275 }
3276
3277 return true;
3278}
3279
3282 Register DstReg, Register TrueReg,
3283 Register FalseReg, int &CondCycles,
3284 int &TrueCycles, int &FalseCycles) const {
3285 switch (Cond[0].getImm()) {
3286 case VCCNZ:
3287 case VCCZ: {
3288 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3289 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3290 if (MRI.getRegClass(FalseReg) != RC)
3291 return false;
3292
3293 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3294 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3295
3296 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3297 return RI.hasVGPRs(RC) && NumInsts <= 6;
3298 }
3299 case SCC_TRUE:
3300 case SCC_FALSE: {
3301 // FIXME: We could insert for VGPRs if we could replace the original compare
3302 // with a vector one.
3303 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3304 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3305 if (MRI.getRegClass(FalseReg) != RC)
3306 return false;
3307
3308 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3309
3310 // Multiples of 8 can do s_cselect_b64
3311 if (NumInsts % 2 == 0)
3312 NumInsts /= 2;
3313
3314 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3315 return RI.isSGPRClass(RC);
3316 }
3317 default:
3318 return false;
3319 }
3320}
3321
3322void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3323 MachineBasicBlock::iterator I, const DebugLoc &DL,
3324 Register DstReg, ArrayRef<MachineOperand> Cond,
3325 Register TrueReg, Register FalseReg) const {
3326 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3327 if (Pred == VCCZ || Pred == SCC_FALSE) {
3328 Pred = static_cast<BranchPredicate>(-Pred);
3329 std::swap(TrueReg, FalseReg);
3330 }
3331
3332 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3333 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3334 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3335
3336 if (DstSize == 32) {
3337 MachineInstr *Select;
3338 if (Pred == SCC_TRUE) {
3339 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3340 .addReg(TrueReg)
3341 .addReg(FalseReg);
3342 } else {
3343 // Instruction's operands are backwards from what is expected.
3344 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3345 .addReg(FalseReg)
3346 .addReg(TrueReg);
3347 }
3348
3349 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3350 return;
3351 }
3352
3353 if (DstSize == 64 && Pred == SCC_TRUE) {
3354 MachineInstr *Select =
3355 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3356 .addReg(TrueReg)
3357 .addReg(FalseReg);
3358
3359 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3360 return;
3361 }
3362
3363 static const int16_t Sub0_15[] = {
3364 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3365 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3366 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3367 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3368 };
3369
3370 static const int16_t Sub0_15_64[] = {
3371 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3372 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3373 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3374 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3375 };
3376
3377 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3378 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3379 const int16_t *SubIndices = Sub0_15;
3380 int NElts = DstSize / 32;
3381
3382 // 64-bit select is only available for SALU.
3383 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3384 if (Pred == SCC_TRUE) {
3385 if (NElts % 2) {
3386 SelOp = AMDGPU::S_CSELECT_B32;
3387 EltRC = &AMDGPU::SGPR_32RegClass;
3388 } else {
3389 SelOp = AMDGPU::S_CSELECT_B64;
3390 EltRC = &AMDGPU::SGPR_64RegClass;
3391 SubIndices = Sub0_15_64;
3392 NElts /= 2;
3393 }
3394 }
3395
3397 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3398
3399 I = MIB->getIterator();
3400
3402 for (int Idx = 0; Idx != NElts; ++Idx) {
3403 Register DstElt = MRI.createVirtualRegister(EltRC);
3404 Regs.push_back(DstElt);
3405
3406 unsigned SubIdx = SubIndices[Idx];
3407
3408 MachineInstr *Select;
3409 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3410 Select =
3411 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3412 .addReg(FalseReg, 0, SubIdx)
3413 .addReg(TrueReg, 0, SubIdx);
3414 } else {
3415 Select =
3416 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3417 .addReg(TrueReg, 0, SubIdx)
3418 .addReg(FalseReg, 0, SubIdx);
3419 }
3420
3421 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3423
3424 MIB.addReg(DstElt)
3425 .addImm(SubIdx);
3426 }
3427}
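// Example: a 128-bit SGPR select under SCC_TRUE is split into two
// S_CSELECT_B64 copies over sub0_sub1 and sub2_sub3 that are recombined with
// REG_SEQUENCE, while the same width on VGPRs needs four V_CNDMASK_B32_e32.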
3428
3429static bool isFoldableCopy(const MachineInstr &MI) {
3430 switch (MI.getOpcode()) {
3431 case AMDGPU::V_MOV_B16_t16_e32:
3432 case AMDGPU::V_MOV_B16_t16_e64:
3433 case AMDGPU::V_MOV_B32_e32:
3434 case AMDGPU::V_MOV_B32_e64:
3435 case AMDGPU::V_MOV_B64_PSEUDO:
3436 case AMDGPU::V_MOV_B64_e32:
3437 case AMDGPU::V_MOV_B64_e64:
3438 case AMDGPU::S_MOV_B32:
3439 case AMDGPU::S_MOV_B64:
3440 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3441 case AMDGPU::COPY:
3442 case AMDGPU::WWM_COPY:
3443 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3444 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3445 case AMDGPU::V_ACCVGPR_MOV_B32:
3446 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3447 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3448 return true;
3449 default:
3450 return false;
3451 }
3452}
3453
3455 switch (MI.getOpcode()) {
3456 case AMDGPU::V_MOV_B16_t16_e32:
3457 case AMDGPU::V_MOV_B16_t16_e64:
3458 return 2;
3459 case AMDGPU::V_MOV_B32_e32:
3460 case AMDGPU::V_MOV_B32_e64:
3461 case AMDGPU::V_MOV_B64_PSEUDO:
3462 case AMDGPU::V_MOV_B64_e32:
3463 case AMDGPU::V_MOV_B64_e64:
3464 case AMDGPU::S_MOV_B32:
3465 case AMDGPU::S_MOV_B64:
3466 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3467 case AMDGPU::COPY:
3468 case AMDGPU::WWM_COPY:
3469 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3470 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3471 case AMDGPU::V_ACCVGPR_MOV_B32:
3472 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3473 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3474 return 1;
3475 default:
3476 llvm_unreachable("MI is not a foldable copy");
3477 }
3478}
3479
3480static constexpr AMDGPU::OpName ModifierOpNames[] = {
3481 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3482 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3483 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3484
3485void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3486 unsigned Opc = MI.getOpcode();
3487 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3488 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3489 if (Idx >= 0)
3490 MI.removeOperand(Idx);
3491 }
3492}
3493
3495 const MCInstrDesc &NewDesc) const {
3496 MI.setDesc(NewDesc);
3497
3498 // Remove any leftover implicit operands from mutating the instruction. e.g.
3499 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3500 // anymore.
3501 const MCInstrDesc &Desc = MI.getDesc();
3502 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3503 Desc.implicit_defs().size();
3504
3505 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3506 MI.removeOperand(I);
3507}
3508
3509std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3510 unsigned SubRegIndex) {
3511 switch (SubRegIndex) {
3512 case AMDGPU::NoSubRegister:
3513 return Imm;
3514 case AMDGPU::sub0:
3515 return SignExtend64<32>(Imm);
3516 case AMDGPU::sub1:
3517 return SignExtend64<32>(Imm >> 32);
3518 case AMDGPU::lo16:
3519 return SignExtend64<16>(Imm);
3520 case AMDGPU::hi16:
3521 return SignExtend64<16>(Imm >> 16);
3522 case AMDGPU::sub1_lo16:
3523 return SignExtend64<16>(Imm >> 32);
3524 case AMDGPU::sub1_hi16:
3525 return SignExtend64<16>(Imm >> 48);
3526 default:
3527 return std::nullopt;
3528 }
3529
3530 llvm_unreachable("covered subregister switch");
3531}
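// Worked example (illustrative): with Imm = 0x11223344AABBCCDD,
//   sub0      -> SignExtend64<32>(0xAABBCCDD) = 0xFFFFFFFFAABBCCDD
//   sub1      -> SignExtend64<32>(0x11223344) = 0x0000000011223344
//   lo16      -> SignExtend64<16>(0xCCDD)     = 0xFFFFFFFFFFFFCCDD
//   hi16      -> SignExtend64<16>(0xAABB)     = 0xFFFFFFFFFFFFAABB
//   sub1_lo16 -> SignExtend64<16>(0x3344)     = 0x0000000000003344
//   sub1_hi16 -> SignExtend64<16>(0x1122)     = 0x0000000000001122
// and any other subregister index yields std::nullopt.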
3532
3533static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3534 switch (Opc) {
3535 case AMDGPU::V_MAC_F16_e32:
3536 case AMDGPU::V_MAC_F16_e64:
3537 case AMDGPU::V_MAD_F16_e64:
3538 return AMDGPU::V_MADAK_F16;
3539 case AMDGPU::V_MAC_F32_e32:
3540 case AMDGPU::V_MAC_F32_e64:
3541 case AMDGPU::V_MAD_F32_e64:
3542 return AMDGPU::V_MADAK_F32;
3543 case AMDGPU::V_FMAC_F32_e32:
3544 case AMDGPU::V_FMAC_F32_e64:
3545 case AMDGPU::V_FMA_F32_e64:
3546 return AMDGPU::V_FMAAK_F32;
3547 case AMDGPU::V_FMAC_F16_e32:
3548 case AMDGPU::V_FMAC_F16_e64:
3549 case AMDGPU::V_FMAC_F16_t16_e64:
3550 case AMDGPU::V_FMAC_F16_fake16_e64:
3551 case AMDGPU::V_FMAC_F16_t16_e32:
3552 case AMDGPU::V_FMAC_F16_fake16_e32:
3553 case AMDGPU::V_FMA_F16_e64:
3554 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3555 ? AMDGPU::V_FMAAK_F16_t16
3556 : AMDGPU::V_FMAAK_F16_fake16
3557 : AMDGPU::V_FMAAK_F16;
3558 case AMDGPU::V_FMAC_F64_e32:
3559 case AMDGPU::V_FMAC_F64_e64:
3560 case AMDGPU::V_FMA_F64_e64:
3561 return AMDGPU::V_FMAAK_F64;
3562 default:
3563 llvm_unreachable("invalid instruction");
3564 }
3565}
3566
3567static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3568 switch (Opc) {
3569 case AMDGPU::V_MAC_F16_e32:
3570 case AMDGPU::V_MAC_F16_e64:
3571 case AMDGPU::V_MAD_F16_e64:
3572 return AMDGPU::V_MADMK_F16;
3573 case AMDGPU::V_MAC_F32_e32:
3574 case AMDGPU::V_MAC_F32_e64:
3575 case AMDGPU::V_MAD_F32_e64:
3576 return AMDGPU::V_MADMK_F32;
3577 case AMDGPU::V_FMAC_F32_e32:
3578 case AMDGPU::V_FMAC_F32_e64:
3579 case AMDGPU::V_FMA_F32_e64:
3580 return AMDGPU::V_FMAMK_F32;
3581 case AMDGPU::V_FMAC_F16_e32:
3582 case AMDGPU::V_FMAC_F16_e64:
3583 case AMDGPU::V_FMAC_F16_t16_e64:
3584 case AMDGPU::V_FMAC_F16_fake16_e64:
3585 case AMDGPU::V_FMAC_F16_t16_e32:
3586 case AMDGPU::V_FMAC_F16_fake16_e32:
3587 case AMDGPU::V_FMA_F16_e64:
3588 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3589 ? AMDGPU::V_FMAMK_F16_t16
3590 : AMDGPU::V_FMAMK_F16_fake16
3591 : AMDGPU::V_FMAMK_F16;
3592 case AMDGPU::V_FMAC_F64_e32:
3593 case AMDGPU::V_FMAC_F64_e64:
3594 case AMDGPU::V_FMA_F64_e64:
3595 return AMDGPU::V_FMAMK_F64;
3596 default:
3597 llvm_unreachable("invalid instruction");
3598 }
3599}
3600
3601bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3602 Register Reg, MachineRegisterInfo *MRI) const {
3603 int64_t Imm;
3604 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3605 return false;
3606
3607 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3608
3609 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3610
3611 unsigned Opc = UseMI.getOpcode();
3612 if (Opc == AMDGPU::COPY) {
3613 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3614
3615 Register DstReg = UseMI.getOperand(0).getReg();
3616 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3617
3618 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3619
3620 if (HasMultipleUses) {
3621      // TODO: This should fold in more cases with multiple uses, but we need to
3622 // more carefully consider what those uses are.
3623 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3624
3625 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3626 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3627 return false;
3628
3629 // Most of the time folding a 32-bit inline constant is free (though this
3630 // might not be true if we can't later fold it into a real user).
3631 //
3632 // FIXME: This isInlineConstant check is imprecise if
3633 // getConstValDefinedInReg handled the tricky non-mov cases.
3634      if (ImmDefSize == 32 &&
3635          isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32))
3636 return false;
3637 }
3638
3639 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3640 RI.getSubRegIdxSize(UseSubReg) == 16;
3641
3642 if (Is16Bit) {
3643 if (RI.hasVGPRs(DstRC))
3644 return false; // Do not clobber vgpr_hi16
3645
3646 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3647 return false;
3648 }
3649
3650 MachineFunction *MF = UseMI.getMF();
3651
3652 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3653 MCRegister MovDstPhysReg =
3654 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3655
3656 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3657
3658 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3659 for (unsigned MovOp :
3660 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3661 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3662 const MCInstrDesc &MovDesc = get(MovOp);
3663
3664 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3665 if (Is16Bit) {
3666 // We just need to find a correctly sized register class, so the
3667 // subregister index compatibility doesn't matter since we're statically
3668 // extracting the immediate value.
3669 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3670 if (!MovDstRC)
3671 continue;
3672
3673 if (MovDstPhysReg) {
3674 // FIXME: We probably should not do this. If there is a live value in
3675 // the high half of the register, it will be corrupted.
3676 MovDstPhysReg =
3677 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3678 if (!MovDstPhysReg)
3679 continue;
3680 }
3681 }
3682
3683 // Result class isn't the right size, try the next instruction.
3684 if (MovDstPhysReg) {
3685 if (!MovDstRC->contains(MovDstPhysReg))
3686 return false;
3687 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3688 // TODO: This will be overly conservative in the case of 16-bit virtual
3689 // SGPRs. We could hack up the virtual register uses to use a compatible
3690 // 32-bit class.
3691 continue;
3692 }
3693
3694 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3695
3696 // Ensure the interpreted immediate value is a valid operand in the new
3697 // mov.
3698 //
3699 // FIXME: isImmOperandLegal should have form that doesn't require existing
3700 // MachineInstr or MachineOperand
3701 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3702 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3703 break;
3704
3705 NewOpc = MovOp;
3706 break;
3707 }
3708
3709 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3710 return false;
3711
3712 if (Is16Bit) {
3713 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3714 if (MovDstPhysReg)
3715 UseMI.getOperand(0).setReg(MovDstPhysReg);
3716 assert(UseMI.getOperand(1).getReg().isVirtual());
3717 }
3718
3719 const MCInstrDesc &NewMCID = get(NewOpc);
3720 UseMI.setDesc(NewMCID);
3721 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3722 UseMI.addImplicitDefUseOperands(*MF);
3723 return true;
3724 }
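  // For illustration: given
  //   %a:sreg_32 = S_MOV_B32 42
  //   %b:vgpr_32 = COPY %a
  // the loop above settles on V_MOV_B32_e32, so the COPY is rewritten in
  // place to
  //   %b:vgpr_32 = V_MOV_B32_e32 42
  // and the defining S_MOV_B32 can be deleted later once it has no remaining
  // uses.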
3725
3726 if (HasMultipleUses)
3727 return false;
3728
3729 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3730 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3731 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3732 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3733 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3734 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3735 Opc == AMDGPU::V_FMAC_F64_e64) {
3736 // Don't fold if we are using source or output modifiers. The new VOP2
3737 // instructions don't have them.
3738    if (hasAnyModifiersSet(UseMI))
3739      return false;
3740
3741 // If this is a free constant, there's no reason to do this.
3742 // TODO: We could fold this here instead of letting SIFoldOperands do it
3743 // later.
3744 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3745
3746 // Any src operand can be used for the legality check.
3747 if (isInlineConstant(UseMI, Src0Idx, Imm))
3748 return false;
3749
3750 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3751
3752 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3753 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3754
3755 auto CopyRegOperandToNarrowerRC =
3756 [MRI, this](MachineInstr &MI, unsigned OpNo,
3757 const TargetRegisterClass *NewRC) -> void {
3758 if (!MI.getOperand(OpNo).isReg())
3759 return;
3760 Register Reg = MI.getOperand(OpNo).getReg();
3761 const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg);
3762 if (RI.getCommonSubClass(RC, NewRC) != NewRC)
3763 return;
3764 Register Tmp = MRI->createVirtualRegister(NewRC);
3765 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
3766 get(AMDGPU::COPY), Tmp)
3767 .addReg(Reg);
3768 MI.getOperand(OpNo).setReg(Tmp);
3769 MI.getOperand(OpNo).setIsKill();
3770 };
3771
3772 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3773 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3774 (Src1->isReg() && Src1->getReg() == Reg)) {
3775 MachineOperand *RegSrc =
3776 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3777 if (!RegSrc->isReg())
3778 return false;
3779 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3780 ST.getConstantBusLimit(Opc) < 2)
3781 return false;
3782
3783 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3784 return false;
3785
3786 // If src2 is also a literal constant then we have to choose which one to
3787 // fold. In general it is better to choose madak so that the other literal
3788 // can be materialized in an sgpr instead of a vgpr:
3789 // s_mov_b32 s0, literal
3790 // v_madak_f32 v0, s0, v0, literal
3791 // Instead of:
3792 // v_mov_b32 v1, literal
3793 // v_madmk_f32 v0, v0, literal, v1
3794 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3795 if (Def && Def->isMoveImmediate() &&
3796 !isInlineConstant(Def->getOperand(1)))
3797 return false;
3798
3799 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3800 if (pseudoToMCOpcode(NewOpc) == -1)
3801 return false;
3802
3803 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3804 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3805
3806 // FIXME: This would be a lot easier if we could return a new instruction
3807 // instead of having to modify in place.
3808
3809 Register SrcReg = RegSrc->getReg();
3810 unsigned SrcSubReg = RegSrc->getSubReg();
3811 Src0->setReg(SrcReg);
3812 Src0->setSubReg(SrcSubReg);
3813 Src0->setIsKill(RegSrc->isKill());
3814
3815 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3816 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3817 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3818 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3819 UseMI.untieRegOperand(
3820 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3821
3822 Src1->ChangeToImmediate(*SubRegImm);
3823
3824      removeModOperands(UseMI);
3825 UseMI.setDesc(get(NewOpc));
3826
3827 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3828 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3829 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3830 Register Tmp = MRI->createVirtualRegister(NewRC);
3831 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3832 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3833 UseMI.getOperand(0).getReg())
3834 .addReg(Tmp, RegState::Kill);
3835 UseMI.getOperand(0).setReg(Tmp);
3836 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3837 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3838 }
3839
3840 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3841 if (DeleteDef)
3842 DefMI.eraseFromParent();
3843
3844 return true;
3845 }
3846
3847 // Added part is the constant: Use v_madak_{f16, f32}.
3848 if (Src2->isReg() && Src2->getReg() == Reg) {
3849 if (ST.getConstantBusLimit(Opc) < 2) {
3850 // Not allowed to use constant bus for another operand.
3851 // We can however allow an inline immediate as src0.
3852 bool Src0Inlined = false;
3853 if (Src0->isReg()) {
3854 // Try to inline constant if possible.
3855          // If the def is a move-immediate and this is its only use, we are
3856          // saving a VGPR here.
3857 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3858 if (Def && Def->isMoveImmediate() &&
3859 isInlineConstant(Def->getOperand(1)) &&
3860 MRI->hasOneNonDBGUse(Src0->getReg())) {
3861 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3862 Src0Inlined = true;
3863 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3864 RI.isSGPRReg(*MRI, Src0->getReg())) {
3865 return false;
3866 }
3867 // VGPR is okay as Src0 - fallthrough
3868 }
3869
3870 if (Src1->isReg() && !Src0Inlined) {
3871 // We have one slot for inlinable constant so far - try to fill it
3872 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3873 if (Def && Def->isMoveImmediate() &&
3874 isInlineConstant(Def->getOperand(1)) &&
3875 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3876 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3877 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3878 return false;
3879 // VGPR is okay as Src1 - fallthrough
3880 }
3881 }
3882
3883 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3884 if (pseudoToMCOpcode(NewOpc) == -1)
3885 return false;
3886
3887 // FIXME: This would be a lot easier if we could return a new instruction
3888 // instead of having to modify in place.
3889
3890 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3891 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3892 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3893 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3894 UseMI.untieRegOperand(
3895 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3896
3897 const std::optional<int64_t> SubRegImm =
3898 extractSubregFromImm(Imm, Src2->getSubReg());
3899
3900 // ChangingToImmediate adds Src2 back to the instruction.
3901 Src2->ChangeToImmediate(*SubRegImm);
3902
3903      // These come before src2.
3904      removeModOperands(UseMI);
3905 UseMI.setDesc(get(NewOpc));
3906
3907 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3908 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
3909 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3910 Register Tmp = MRI->createVirtualRegister(NewRC);
3911 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3912 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3913 UseMI.getOperand(0).getReg())
3914 .addReg(Tmp, RegState::Kill);
3915 UseMI.getOperand(0).setReg(Tmp);
3916 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3917 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
3918 }
3919
3920      // It might happen that UseMI was commuted and we now have an SGPR as
3921      // SRC1. If so, the combination of an inline constant and an SGPR is
3922      // illegal, so re-legalize the operands.
3923      legalizeOperands(UseMI);
3924
3925 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3926 if (DeleteDef)
3927 DefMI.eraseFromParent();
3928
3929 return true;
3930 }
3931 }
3932
3933 return false;
3934}
3935
3936static bool
3937memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3938                           ArrayRef<const MachineOperand *> BaseOps2) {
3939 if (BaseOps1.size() != BaseOps2.size())
3940 return false;
3941 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3942 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3943 return false;
3944 }
3945 return true;
3946}
3947
3948static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3949 LocationSize WidthB, int OffsetB) {
3950 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3951 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3952 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3953 return LowWidth.hasValue() &&
3954 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3955}
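// Example (for illustration): two accesses of width 4 at offsets 0 and 4 do
// not overlap (0 + 4 <= 4), whereas widths of 4 at offsets 0 and 2 may
// overlap (0 + 4 > 2), so only the first case returns true.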
3956
3957bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3958 const MachineInstr &MIb) const {
3959 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3960 int64_t Offset0, Offset1;
3961 LocationSize Dummy0 = LocationSize::precise(0);
3962 LocationSize Dummy1 = LocationSize::precise(0);
3963 bool Offset0IsScalable, Offset1IsScalable;
3964 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3965 Dummy0, &RI) ||
3966 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3967 Dummy1, &RI))
3968 return false;
3969
3970 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3971 return false;
3972
3973 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3974 // FIXME: Handle ds_read2 / ds_write2.
3975 return false;
3976 }
3977 LocationSize Width0 = MIa.memoperands().front()->getSize();
3978 LocationSize Width1 = MIb.memoperands().front()->getSize();
3979 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3980}
3981
3982bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3983 const MachineInstr &MIb) const {
3984 assert(MIa.mayLoadOrStore() &&
3985 "MIa must load from or modify a memory location");
3986 assert(MIb.mayLoadOrStore() &&
3987 "MIb must load from or modify a memory location");
3988
3989  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3990 return false;
3991
3992 // XXX - Can we relax this between address spaces?
3993 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3994 return false;
3995
3996 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3997 return false;
3998
3999 if (MIa.isBundle() || MIb.isBundle())
4000 return false;
4001
4002 // TODO: Should we check the address space from the MachineMemOperand? That
4003 // would allow us to distinguish objects we know don't alias based on the
4004 // underlying address space, even if it was lowered to a different one,
4005 // e.g. private accesses lowered to use MUBUF instructions on a scratch
4006 // buffer.
4007 if (isDS(MIa)) {
4008 if (isDS(MIb))
4009 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4010
4011 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
4012 }
4013
4014 if (isMUBUF(MIa) || isMTBUF(MIa)) {
4015 if (isMUBUF(MIb) || isMTBUF(MIb))
4016 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4017
4018 if (isFLAT(MIb))
4019 return isFLATScratch(MIb);
4020
4021 return !isSMRD(MIb);
4022 }
4023
4024 if (isSMRD(MIa)) {
4025 if (isSMRD(MIb))
4026 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4027
4028 if (isFLAT(MIb))
4029 return isFLATScratch(MIb);
4030
4031 return !isMUBUF(MIb) && !isMTBUF(MIb);
4032 }
4033
4034 if (isFLAT(MIa)) {
4035 if (isFLAT(MIb)) {
4036 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
4037 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
4038 return true;
4039
4040 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4041 }
4042
4043 return false;
4044 }
4045
4046 return false;
4047}
4048
4049static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
4050 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4051 if (Reg.isPhysical())
4052 return false;
4053 auto *Def = MRI.getUniqueVRegDef(Reg);
4054 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
4055 Imm = Def->getOperand(1).getImm();
4056 if (DefMI)
4057 *DefMI = Def;
4058 return true;
4059 }
4060 return false;
4061}
4062
4063static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4064 MachineInstr **DefMI = nullptr) {
4065 if (!MO->isReg())
4066 return false;
4067 const MachineFunction *MF = MO->getParent()->getMF();
4068 const MachineRegisterInfo &MRI = MF->getRegInfo();
4069 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4070}
4071
4072static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
4073 MachineInstr &NewMI) {
4074 if (LV) {
4075 unsigned NumOps = MI.getNumOperands();
4076 for (unsigned I = 1; I < NumOps; ++I) {
4077 MachineOperand &Op = MI.getOperand(I);
4078 if (Op.isReg() && Op.isKill())
4079 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4080 }
4081 }
4082}
4083
4084static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4085 switch (Opc) {
4086 case AMDGPU::V_MAC_F16_e32:
4087 case AMDGPU::V_MAC_F16_e64:
4088 return AMDGPU::V_MAD_F16_e64;
4089 case AMDGPU::V_MAC_F32_e32:
4090 case AMDGPU::V_MAC_F32_e64:
4091 return AMDGPU::V_MAD_F32_e64;
4092 case AMDGPU::V_MAC_LEGACY_F32_e32:
4093 case AMDGPU::V_MAC_LEGACY_F32_e64:
4094 return AMDGPU::V_MAD_LEGACY_F32_e64;
4095 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4096 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4097 return AMDGPU::V_FMA_LEGACY_F32_e64;
4098 case AMDGPU::V_FMAC_F16_e32:
4099 case AMDGPU::V_FMAC_F16_e64:
4100 case AMDGPU::V_FMAC_F16_t16_e64:
4101 case AMDGPU::V_FMAC_F16_fake16_e64:
4102 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4103 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4104 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4105 : AMDGPU::V_FMA_F16_gfx9_e64;
4106 case AMDGPU::V_FMAC_F32_e32:
4107 case AMDGPU::V_FMAC_F32_e64:
4108 return AMDGPU::V_FMA_F32_e64;
4109 case AMDGPU::V_FMAC_F64_e32:
4110 case AMDGPU::V_FMAC_F64_e64:
4111 return AMDGPU::V_FMA_F64_e64;
4112 default:
4113 llvm_unreachable("invalid instruction");
4114 }
4115}
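// For illustration (source modifiers omitted), the three-address conversion
// below uses these opcodes to map e.g.
//   %d = V_FMAC_F32_e32 %a, %b, %d(tied)
// to
//   %e = V_FMA_F32_e64 %a, %b, %d
// so the tied accumulator input becomes an ordinary third source operand.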
4116
4117/// Helper struct for the implementation of 3-address conversion to communicate
4118/// updates made to instruction operands.
4119struct ThreeAddressUpdates {
4120  /// Other instruction whose def is no longer used by the converted
4121  /// instruction.
4122  MachineInstr *RemoveMIUse = nullptr;
4123};
4124
4125MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4126 LiveVariables *LV,
4127 LiveIntervals *LIS) const {
4128 MachineBasicBlock &MBB = *MI.getParent();
4129 MachineInstr *CandidateMI = &MI;
4130
4131 if (MI.isBundle()) {
4132 // This is a temporary placeholder for bundle handling that enables us to
4133 // exercise the relevant code paths in the two-address instruction pass.
4134 if (MI.getBundleSize() != 1)
4135 return nullptr;
4136 CandidateMI = MI.getNextNode();
4137 }
4138
4140 MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
4141 if (!NewMI)
4142 return nullptr;
4143
4144 if (MI.isBundle()) {
4145 CandidateMI->eraseFromBundle();
4146
4147 for (MachineOperand &MO : MI.all_defs()) {
4148 if (MO.isTied())
4149 MI.untieRegOperand(MO.getOperandNo());
4150 }
4151 } else {
4152 updateLiveVariables(LV, MI, *NewMI);
4153 if (LIS) {
4154 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4155 // SlotIndex of defs needs to be updated when converting to early-clobber
4156 MachineOperand &Def = NewMI->getOperand(0);
4157 if (Def.isEarlyClobber() && Def.isReg() &&
4158 LIS->hasInterval(Def.getReg())) {
4159 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4160 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4161 auto &LI = LIS->getInterval(Def.getReg());
4162 auto UpdateDefIndex = [&](LiveRange &LR) {
4163 auto *S = LR.find(OldIndex);
4164 if (S != LR.end() && S->start == OldIndex) {
4165 assert(S->valno && S->valno->def == OldIndex);
4166 S->start = NewIndex;
4167 S->valno->def = NewIndex;
4168 }
4169 };
4170 UpdateDefIndex(LI);
4171 for (auto &SR : LI.subranges())
4172 UpdateDefIndex(SR);
4173 }
4174 }
4175 }
4176
4177 if (U.RemoveMIUse) {
4178 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4179 // The only user is the instruction which will be killed.
4180 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4181
4182 if (MRI.hasOneNonDBGUse(DefReg)) {
4183      // We cannot just remove the DefMI here; the calling pass would crash.
4184 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4185 U.RemoveMIUse->getOperand(0).setIsDead(true);
4186 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4187 U.RemoveMIUse->removeOperand(I);
4188 if (LV)
4189 LV->getVarInfo(DefReg).AliveBlocks.clear();
4190 }
4191
4192 if (MI.isBundle()) {
4193 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4194 if (!VRI.Reads && !VRI.Writes) {
4195 for (MachineOperand &MO : MI.all_uses()) {
4196 if (MO.isReg() && MO.getReg() == DefReg) {
4197 assert(MO.getSubReg() == 0 &&
4198 "tied sub-registers in bundles currently not supported");
4199 MI.removeOperand(MO.getOperandNo());
4200 break;
4201 }
4202 }
4203
4204 if (LIS)
4205 LIS->shrinkToUses(&LIS->getInterval(DefReg));
4206 }
4207 } else if (LIS) {
4208 LiveInterval &DefLI = LIS->getInterval(DefReg);
4209
4210 // We cannot delete the original instruction here, so hack out the use
4211 // in the original instruction with a dummy register so we can use
4212 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4213 // not have the complexity of deleting a use to consider here.
4214 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4215 for (MachineOperand &MIOp : MI.uses()) {
4216 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4217 MIOp.setIsUndef(true);
4218 MIOp.setReg(DummyReg);
4219 }
4220 }
4221
4222 if (MI.isBundle()) {
4223 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4224 if (!VRI.Reads && !VRI.Writes) {
4225 for (MachineOperand &MIOp : MI.uses()) {
4226 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4227 MIOp.setIsUndef(true);
4228 MIOp.setReg(DummyReg);
4229 }
4230 }
4231 }
4232
4233 MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
4234 false, /*isUndef=*/true));
4235 }
4236
4237 LIS->shrinkToUses(&DefLI);
4238 }
4239 }
4240
4241 return MI.isBundle() ? &MI : NewMI;
4242}
4243
4244MachineInstr *
4245SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4246 ThreeAddressUpdates &U) const {
4247 MachineBasicBlock &MBB = *MI.getParent();
4248 unsigned Opc = MI.getOpcode();
4249
4250 // Handle MFMA.
4251 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4252  if (NewMFMAOpc != -1) {
4253    MachineInstrBuilder MIB =
4254 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4255 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4256 MIB.add(MI.getOperand(I));
4257 return MIB;
4258 }
4259
4260 if (SIInstrInfo::isWMMA(MI)) {
4261 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4262 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4263 .setMIFlags(MI.getFlags());
4264 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4265 MIB->addOperand(MI.getOperand(I));
4266 return MIB;
4267 }
4268
4269 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4270 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4271 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4272 "present pre-RA");
4273
4274 // Handle MAC/FMAC.
4275 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4276 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4277 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4278 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4279 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4280 bool Src0Literal = false;
4281
4282 switch (Opc) {
4283 default:
4284 return nullptr;
4285 case AMDGPU::V_MAC_F16_e64:
4286 case AMDGPU::V_FMAC_F16_e64:
4287 case AMDGPU::V_FMAC_F16_t16_e64:
4288 case AMDGPU::V_FMAC_F16_fake16_e64:
4289 case AMDGPU::V_MAC_F32_e64:
4290 case AMDGPU::V_MAC_LEGACY_F32_e64:
4291 case AMDGPU::V_FMAC_F32_e64:
4292 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4293 case AMDGPU::V_FMAC_F64_e64:
4294 break;
4295 case AMDGPU::V_MAC_F16_e32:
4296 case AMDGPU::V_FMAC_F16_e32:
4297 case AMDGPU::V_MAC_F32_e32:
4298 case AMDGPU::V_MAC_LEGACY_F32_e32:
4299 case AMDGPU::V_FMAC_F32_e32:
4300 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4301 case AMDGPU::V_FMAC_F64_e32: {
4302 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4303 AMDGPU::OpName::src0);
4304 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4305 if (!Src0->isReg() && !Src0->isImm())
4306 return nullptr;
4307
4308 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4309 Src0Literal = true;
4310
4311 break;
4312 }
4313 }
4314
4315 MachineInstrBuilder MIB;
4316 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4317 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4318 const MachineOperand *Src0Mods =
4319 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4320 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4321 const MachineOperand *Src1Mods =
4322 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4323 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4324 const MachineOperand *Src2Mods =
4325 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4326 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4327 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4328 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4329
4330 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4331 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4332 // If we have an SGPR input, we will violate the constant bus restriction.
4333 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4334 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4335 MachineInstr *DefMI;
4336
4337 int64_t Imm;
4338 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4339 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4340 if (pseudoToMCOpcode(NewOpc) != -1) {
4341 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4342 .add(*Dst)
4343 .add(*Src0)
4344 .add(*Src1)
4345 .addImm(Imm)
4346 .setMIFlags(MI.getFlags());
4347 U.RemoveMIUse = DefMI;
4348 return MIB;
4349 }
4350 }
4351 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4352 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4353 if (pseudoToMCOpcode(NewOpc) != -1) {
4354 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4355 .add(*Dst)
4356 .add(*Src0)
4357 .addImm(Imm)
4358 .add(*Src2)
4359 .setMIFlags(MI.getFlags());
4360 U.RemoveMIUse = DefMI;
4361 return MIB;
4362 }
4363 }
4364 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4365 if (Src0Literal) {
4366 Imm = Src0->getImm();
4367 DefMI = nullptr;
4368 }
4369      if (pseudoToMCOpcode(NewOpc) != -1 &&
4370          isOperandLegal(
4371 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4372 Src1)) {
4373 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4374 .add(*Dst)
4375 .add(*Src1)
4376 .addImm(Imm)
4377 .add(*Src2)
4378 .setMIFlags(MI.getFlags());
4379 U.RemoveMIUse = DefMI;
4380 return MIB;
4381 }
4382 }
4383 }
4384
4385 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4386 // if VOP3 does not allow a literal operand.
4387 if (Src0Literal && !ST.hasVOP3Literal())
4388 return nullptr;
4389
4390 unsigned NewOpc = getNewFMAInst(ST, Opc);
4391
4392 if (pseudoToMCOpcode(NewOpc) == -1)
4393 return nullptr;
4394
4395 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4396 .add(*Dst)
4397 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4398 .add(*Src0)
4399 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4400 .add(*Src1)
4401 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4402 .add(*Src2)
4403 .addImm(Clamp ? Clamp->getImm() : 0)
4404 .addImm(Omod ? Omod->getImm() : 0)
4405 .setMIFlags(MI.getFlags());
4406 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4407 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4408 return MIB;
4409}
4410
4411// It's not generally safe to move VALU instructions across these since it will
4412// start using the register as a base index rather than directly.
4413// XXX - Why isn't hasSideEffects sufficient for these?
4414static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4415 switch (MI.getOpcode()) {
4416 case AMDGPU::S_SET_GPR_IDX_ON:
4417 case AMDGPU::S_SET_GPR_IDX_MODE:
4418 case AMDGPU::S_SET_GPR_IDX_OFF:
4419 return true;
4420 default:
4421 return false;
4422 }
4423}
4424
4426 const MachineBasicBlock *MBB,
4427 const MachineFunction &MF) const {
4428 // Skipping the check for SP writes in the base implementation. The reason it
4429 // was added was apparently due to compile time concerns.
4430 //
4431 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4432 // but is probably avoidable.
4433
4434 // Copied from base implementation.
4435 // Terminators and labels can't be scheduled around.
4436 if (MI.isTerminator() || MI.isPosition())
4437 return true;
4438
4439 // INLINEASM_BR can jump to another block
4440 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4441 return true;
4442
4443 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4444 return true;
4445
4446 // Target-independent instructions do not have an implicit-use of EXEC, even
4447 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4448 // boundaries prevents incorrect movements of such instructions.
4449 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4450 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4451 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4452 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4453         MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4454         changesVGPRIndexingMode(MI);
4455}
4456
4457bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4458 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4459 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4460 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4461}
4462
4463bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
4464 // Instructions that access scratch use FLAT encoding or BUF encodings.
4465 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4466 return false;
4467
4468 // If scratch is not initialized, we can never access it.
4469 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4470 return false;
4471
4472 // SCRATCH instructions always access scratch.
4473 if (isFLATScratch(MI))
4474 return true;
4475
4476 // If there are no memory operands then conservatively assume the flat
4477 // operation may access scratch.
4478 if (MI.memoperands_empty())
4479 return true;
4480
4481 // See if any memory operand specifies an address space that involves scratch.
4482 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4483 unsigned AS = Memop->getAddrSpace();
4484 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4485 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4486 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4487 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4488 }
4489 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4490 });
4491}
4492
4493bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4494 assert(isFLAT(MI));
4495
4496 // All flat instructions use the VMEM counter except prefetch.
4497 if (!usesVM_CNT(MI))
4498 return false;
4499
4500 // If there are no memory operands then conservatively assume the flat
4501 // operation may access VMEM.
4502 if (MI.memoperands_empty())
4503 return true;
4504
4505 // See if any memory operand specifies an address space that involves VMEM.
4506 // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
4507  // Flat operations only support FLAT, LOCAL (LDS), or address spaces
4508 // (GDS) address space is not supported by flat operations. Therefore, simply
4509 // return true unless only the LDS address space is found.
4510 for (const MachineMemOperand *Memop : MI.memoperands()) {
4511 unsigned AS = Memop->getAddrSpace();
4513 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4514 return true;
4515 }
4516
4517 return false;
4518}
4519
4520bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4521 assert(isFLAT(MI));
4522
4523  // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
4524 if (!usesLGKM_CNT(MI))
4525 return false;
4526
4527 // If in tgsplit mode then there can be no use of LDS.
4528 if (ST.isTgSplitEnabled())
4529 return false;
4530
4531 // If there are no memory operands then conservatively assume the flat
4532 // operation may access LDS.
4533 if (MI.memoperands_empty())
4534 return true;
4535
4536 // See if any memory operand specifies an address space that involves LDS.
4537 for (const MachineMemOperand *Memop : MI.memoperands()) {
4538    unsigned AS = Memop->getAddrSpace();
4539    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4540 return true;
4541 }
4542
4543 return false;
4544}
4545
4546bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4547 // Skip the full operand and register alias search modifiesRegister
4548 // does. There's only a handful of instructions that touch this, it's only an
4549 // implicit def, and doesn't alias any other registers.
4550 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4551}
4552
4553bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4554 unsigned Opcode = MI.getOpcode();
4555
4556 if (MI.mayStore() && isSMRD(MI))
4557 return true; // scalar store or atomic
4558
4559 // This will terminate the function when other lanes may need to continue.
4560 if (MI.isReturn())
4561 return true;
4562
4563 // These instructions cause shader I/O that may cause hardware lockups
4564 // when executed with an empty EXEC mask.
4565 //
4566 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4567 // EXEC = 0, but checking for that case here seems not worth it
4568 // given the typical code patterns.
4569 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4570 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4571 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4572 return true;
4573
4574 if (MI.isCall() || MI.isInlineAsm())
4575 return true; // conservative assumption
4576
4577 // Assume that barrier interactions are only intended with active lanes.
4578 if (isBarrier(Opcode))
4579 return true;
4580
4581  // A mode change is a scalar operation that influences vector instructions.
4582  if (modifiesModeRegister(MI))
4583 return true;
4584
4585 // These are like SALU instructions in terms of effects, so it's questionable
4586 // whether we should return true for those.
4587 //
4588 // However, executing them with EXEC = 0 causes them to operate on undefined
4589 // data, which we avoid by returning true here.
4590 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4591 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4592 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4593 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4594 return true;
4595
4596 return false;
4597}
4598
4599bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4600 const MachineInstr &MI) const {
4601 if (MI.isMetaInstruction())
4602 return false;
4603
4604 // This won't read exec if this is an SGPR->SGPR copy.
4605 if (MI.isCopyLike()) {
4606 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4607 return true;
4608
4609 // Make sure this isn't copying exec as a normal operand
4610 return MI.readsRegister(AMDGPU::EXEC, &RI);
4611 }
4612
4613 // Make a conservative assumption about the callee.
4614 if (MI.isCall())
4615 return true;
4616
4617 // Be conservative with any unhandled generic opcodes.
4618 if (!isTargetSpecificOpcode(MI.getOpcode()))
4619 return true;
4620
4621 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4622}
4623
4624bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4625 switch (Imm.getBitWidth()) {
4626 case 1: // This likely will be a condition code mask.
4627 return true;
4628
4629 case 32:
4630 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4631 ST.hasInv2PiInlineImm());
4632 case 64:
4633 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4634 ST.hasInv2PiInlineImm());
4635 case 16:
4636 return ST.has16BitInsts() &&
4637 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4638 ST.hasInv2PiInlineImm());
4639 default:
4640 llvm_unreachable("invalid bitwidth");
4641 }
4642}
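// Example (for illustration): a 32-bit value of 63 is an inline constant
// (small integers in [-16, 64] are free to encode), while 65 is not and would
// have to be materialized as a 32-bit literal.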
4643
4644bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4645 APInt IntImm = Imm.bitcastToAPInt();
4646 int64_t IntImmVal = IntImm.getSExtValue();
4647 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4648 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4649 default:
4650 llvm_unreachable("invalid fltSemantics");
4651  case APFloat::S_IEEEsingle:
4652  case APFloat::S_IEEEdouble:
4653    return isInlineConstant(IntImm);
4654  case APFloat::S_BFloat:
4655    return ST.has16BitInsts() &&
4656           AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4657  case APFloat::S_IEEEhalf:
4658    return ST.has16BitInsts() &&
4659 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4660 }
4661}
4662
4663bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4664 // MachineOperand provides no way to tell the true operand size, since it only
4665 // records a 64-bit value. We need to know the size to determine if a 32-bit
4666 // floating point immediate bit pattern is legal for an integer immediate. It
4667 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4668 switch (OperandType) {
4678 int32_t Trunc = static_cast<int32_t>(Imm);
4679 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4680 }
4686 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4689 // We would expect inline immediates to not be concerned with an integer/fp
4690 // distinction. However, in the case of 16-bit integer operations, the
4691  // "floating point" values appear to not work. It seems to read the low 16 bits
4692 // of 32-bit immediates, which happens to always work for the integer
4693 // values.
4694 //
4695 // See llvm bugzilla 46302.
4696 //
4697 // TODO: Theoretically we could use op-sel to use the high bits of the
4698 // 32-bit FP values.
4707 return AMDGPU::isPKFMACF16InlineConstant(Imm, ST.isGFX11Plus());
4712 return false;
4715 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4716 // A few special case instructions have 16-bit operands on subtargets
4717 // where 16-bit instructions are not legal.
4718 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4719 // constants in these cases
4720 int16_t Trunc = static_cast<int16_t>(Imm);
4721 return ST.has16BitInsts() &&
4722 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4723 }
4724
4725 return false;
4726 }
4729 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4730 int16_t Trunc = static_cast<int16_t>(Imm);
4731 return ST.has16BitInsts() &&
4732 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4733 }
4734 return false;
4735 }
4739 return false;
4741 return isLegalAV64PseudoImm(Imm);
4744 // Always embedded in the instruction for free.
4745 return true;
4755 // Just ignore anything else.
4756 return true;
4757 default:
4758 llvm_unreachable("invalid operand type");
4759 }
4760}
4761
4762static bool compareMachineOp(const MachineOperand &Op0,
4763 const MachineOperand &Op1) {
4764 if (Op0.getType() != Op1.getType())
4765 return false;
4766
4767 switch (Op0.getType()) {
4769 return Op0.getReg() == Op1.getReg();
4771 return Op0.getImm() == Op1.getImm();
4772 default:
4773 llvm_unreachable("Didn't expect to be comparing these operand types");
4774 }
4775}
4776
4777bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4778 const MCOperandInfo &OpInfo) const {
4779 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4780 return true;
4781
4782 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4783 return false;
4784
4785 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4786 return true;
4787
4788 return ST.hasVOP3Literal();
4789}
4790
4791bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4792 int64_t ImmVal) const {
4793 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4794 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4795 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4796 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4797 AMDGPU::OpName::src2))
4798 return false;
4799 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4800 }
4801
4802 return isLiteralOperandLegal(InstDesc, OpInfo);
4803}
4804
4805bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4806 const MachineOperand &MO) const {
4807 if (MO.isImm())
4808 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4809
4810 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4811 "unexpected imm-like operand kind");
4812 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4813 return isLiteralOperandLegal(InstDesc, OpInfo);
4814}
4815
4816bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4817 // 2 32-bit inline constants packed into one.
4818 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4819 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4820}
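// Example (for illustration): 0x0000004000000001 splits into the halves 64
// and 1, both inline constants, so it is a legal AV_MOV_B64_IMM_PSEUDO
// immediate; 0x0000004100000001 is not, because the high half (65) cannot be
// encoded inline.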
4821
4822bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4823 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4824 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4825 return false;
4826
4827 int Op32 = AMDGPU::getVOPe32(Opcode);
4828 if (Op32 == -1)
4829 return false;
4830
4831 return pseudoToMCOpcode(Op32) != -1;
4832}
4833
4834bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4835 // The src0_modifier operand is present on all instructions
4836 // that have modifiers.
4837
4838 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4839}
4840
4841bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4842 AMDGPU::OpName OpName) const {
4843 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4844 return Mods && Mods->getImm();
4845}
4846
4847bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4848 return any_of(ModifierOpNames,
4849 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4850}
4851
4852bool SIInstrInfo::canShrink(const MachineInstr &MI,
4853 const MachineRegisterInfo &MRI) const {
4854 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4855 // Can't shrink instruction with three operands.
4856 if (Src2) {
4857 switch (MI.getOpcode()) {
4858 default: return false;
4859
4860 case AMDGPU::V_ADDC_U32_e64:
4861 case AMDGPU::V_SUBB_U32_e64:
4862 case AMDGPU::V_SUBBREV_U32_e64: {
4863 const MachineOperand *Src1
4864 = getNamedOperand(MI, AMDGPU::OpName::src1);
4865 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4866 return false;
4867 // Additional verification is needed for sdst/src2.
4868 return true;
4869 }
4870 case AMDGPU::V_MAC_F16_e64:
4871 case AMDGPU::V_MAC_F32_e64:
4872 case AMDGPU::V_MAC_LEGACY_F32_e64:
4873 case AMDGPU::V_FMAC_F16_e64:
4874 case AMDGPU::V_FMAC_F16_t16_e64:
4875 case AMDGPU::V_FMAC_F16_fake16_e64:
4876 case AMDGPU::V_FMAC_F32_e64:
4877 case AMDGPU::V_FMAC_F64_e64:
4878 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4879 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4880 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4881 return false;
4882 break;
4883
4884 case AMDGPU::V_CNDMASK_B32_e64:
4885 break;
4886 }
4887 }
4888
4889 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4890 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4891 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4892 return false;
4893
4894 // We don't need to check src0, all input types are legal, so just make sure
4895 // src0 isn't using any modifiers.
4896 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4897 return false;
4898
4899 // Can it be shrunk to a valid 32 bit opcode?
4900 if (!hasVALU32BitEncoding(MI.getOpcode()))
4901 return false;
4902
4903 // Check output modifiers
4904 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4905 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4906 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4907 // TODO: Can we avoid checking bound_ctrl/fi here?
4908 // They are only used by permlane*_swap special case.
4909 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4910 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4911}
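// For illustration: a V_ADD_F32_e64 whose src1 is a VGPR and which uses no
// source or output modifiers can be shrunk to V_ADD_F32_e32, while the same
// instruction with clamp or omod set (or with an SGPR src1) has to stay in
// the VOP3 encoding.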
4912
4913// Set VCC operand with all flags from \p Orig, except for setting it as
4914// implicit.
4915static void copyFlagsToImplicitVCC(MachineInstr &MI,
4916 const MachineOperand &Orig) {
4917
4918 for (MachineOperand &Use : MI.implicit_operands()) {
4919 if (Use.isUse() &&
4920 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4921 Use.setIsUndef(Orig.isUndef());
4922 Use.setIsKill(Orig.isKill());
4923 return;
4924 }
4925 }
4926}
4927
4928MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4929 unsigned Op32) const {
4930 MachineBasicBlock *MBB = MI.getParent();
4931
4932 const MCInstrDesc &Op32Desc = get(Op32);
4933 MachineInstrBuilder Inst32 =
4934 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4935 .setMIFlags(MI.getFlags());
4936
4937 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4938 // For VOPC instructions, this is replaced by an implicit def of vcc.
4939
4940 // We assume the defs of the shrunk opcode are in the same order, and the
4941 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4942 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4943 Inst32.add(MI.getOperand(I));
4944
4945 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4946
4947 int Idx = MI.getNumExplicitDefs();
4948 for (const MachineOperand &Use : MI.explicit_uses()) {
4949 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4951 continue;
4952
4953 if (&Use == Src2) {
4954 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4955 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4956 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4957 // of vcc was already added during the initial BuildMI, but we
4958 // 1) may need to change vcc to vcc_lo to preserve the original register
4959 // 2) have to preserve the original flags.
4960 copyFlagsToImplicitVCC(*Inst32, *Src2);
4961 continue;
4962 }
4963 }
4964
4965 Inst32.add(Use);
4966 }
4967
4968 // FIXME: Losing implicit operands
4969 fixImplicitOperands(*Inst32);
4970 return Inst32;
4971}
4972
4973bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4974 // Null is free
4975 Register Reg = RegOp.getReg();
4976 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4977 return false;
4978
4979 // SGPRs use the constant bus
4980
4981 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4982 // physical register operands should also count, except for exec.
4983 if (RegOp.isImplicit())
4984 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4985
4986 // SGPRs use the constant bus
4987 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4988 AMDGPU::SReg_64RegClass.contains(Reg);
4989}
4990
4992 const MachineRegisterInfo &MRI) const {
4993 Register Reg = RegOp.getReg();
4994 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4995 : physRegUsesConstantBus(RegOp);
4996}
4997
4998bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4999 const MachineOperand &MO,
5000 const MCOperandInfo &OpInfo) const {
5001 // Literal constants use the constant bus.
5002 if (!MO.isReg())
5003 return !isInlineConstant(MO, OpInfo);
5004
5005 Register Reg = MO.getReg();
5006  return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5007                         : physRegUsesConstantBus(MO);
5008}
5009
5010Register SIInstrInfo::findImplicitSGPRRead(const MachineInstr &MI) {
5011 for (const MachineOperand &MO : MI.implicit_operands()) {
5012 // We only care about reads.
5013 if (MO.isDef())
5014 continue;
5015
5016 switch (MO.getReg()) {
5017 case AMDGPU::VCC:
5018 case AMDGPU::VCC_LO:
5019 case AMDGPU::VCC_HI:
5020 case AMDGPU::M0:
5021 case AMDGPU::FLAT_SCR:
5022 return MO.getReg();
5023
5024 default:
5025 break;
5026 }
5027 }
5028
5029 return Register();
5030}
5031
5032static bool shouldReadExec(const MachineInstr &MI) {
5033 if (SIInstrInfo::isVALU(MI)) {
5034 switch (MI.getOpcode()) {
5035 case AMDGPU::V_READLANE_B32:
5036 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5037 case AMDGPU::V_WRITELANE_B32:
5038 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5039 return false;
5040 }
5041
5042 return true;
5043 }
5044
5045 if (MI.isPreISelOpcode() ||
5046      SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
5047      SIInstrInfo::isSALU(MI) ||
5048      SIInstrInfo::isSMRD(MI))
5049 return false;
5050
5051 return true;
5052}
5053
5054static bool isRegOrFI(const MachineOperand &MO) {
5055 return MO.isReg() || MO.isFI();
5056}
5057
5058static bool isSubRegOf(const SIRegisterInfo &TRI,
5059 const MachineOperand &SuperVec,
5060 const MachineOperand &SubReg) {
5061 if (SubReg.getReg().isPhysical())
5062 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5063
5064 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5065 SubReg.getReg() == SuperVec.getReg();
5066}
5067
5068// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5069bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5070 const MachineRegisterInfo &MRI,
5071 StringRef &ErrInfo) const {
5072 Register DstReg = MI.getOperand(0).getReg();
5073 Register SrcReg = MI.getOperand(1).getReg();
5074 // This is a check for copy from vector register to SGPR
5075 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5076 ErrInfo = "illegal copy from vector register to SGPR";
5077 return false;
5078 }
5079 return true;
5080}
5081
5082bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
5083 StringRef &ErrInfo) const {
5084 uint16_t Opcode = MI.getOpcode();
5085 const MachineFunction *MF = MI.getMF();
5086 const MachineRegisterInfo &MRI = MF->getRegInfo();
5087
5088 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5089 // Find a better property to recognize the point where instruction selection
5090 // is just done.
5091 // We can only enforce this check after SIFixSGPRCopies pass so that the
5092 // illegal copies are legalized and thereafter we don't expect a pass
5093 // inserting similar copies.
5094 if (!MRI.isSSA() && MI.isCopy())
5095 return verifyCopy(MI, MRI, ErrInfo);
5096
5097 if (SIInstrInfo::isGenericOpcode(Opcode))
5098 return true;
5099
5100 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5101 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5102 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5103 int Src3Idx = -1;
5104 if (Src0Idx == -1) {
5105 // VOPD V_DUAL_* instructions use different operand names.
5106 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5107 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5108 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5109 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5110 }
5111
5112 // Make sure the number of operands is correct.
5113 const MCInstrDesc &Desc = get(Opcode);
5114 if (!Desc.isVariadic() &&
5115 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5116 ErrInfo = "Instruction has wrong number of operands.";
5117 return false;
5118 }
5119
5120 if (MI.isInlineAsm()) {
5121 // Verify register classes for inlineasm constraints.
5122 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5123 I != E; ++I) {
5124 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5125 if (!RC)
5126 continue;
5127
5128 const MachineOperand &Op = MI.getOperand(I);
5129 if (!Op.isReg())
5130 continue;
5131
5132 Register Reg = Op.getReg();
5133 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5134 ErrInfo = "inlineasm operand has incorrect register class.";
5135 return false;
5136 }
5137 }
5138
5139 return true;
5140 }
5141
5142 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5143 ErrInfo = "missing memory operand from image instruction.";
5144 return false;
5145 }
5146
5147 // Make sure the register classes are correct.
5148 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5149 const MachineOperand &MO = MI.getOperand(i);
5150 if (MO.isFPImm()) {
5151 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5152 "all fp values to integers.";
5153 return false;
5154 }
5155
5156 const MCOperandInfo &OpInfo = Desc.operands()[i];
5157 int16_t RegClass = getOpRegClassID(OpInfo);
5158
5159 switch (OpInfo.OperandType) {
5161 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5162 ErrInfo = "Illegal immediate value for operand.";
5163 return false;
5164 }
5165 break;
5179 break;
5181 break;
5182 break;
5196 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5197 ErrInfo = "Illegal immediate value for operand.";
5198 return false;
5199 }
5200 break;
5201 }
5203 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5204 ErrInfo = "Expected inline constant for operand.";
5205 return false;
5206 }
5207 break;
5211 break;
5216 // Check if this operand is an immediate.
5217 // FrameIndex operands will be replaced by immediates, so they are
5218 // allowed.
5219 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5220 ErrInfo = "Expected immediate, but got non-immediate";
5221 return false;
5222 }
5223 break;
5227 break;
5228 default:
5229 if (OpInfo.isGenericType())
5230 continue;
5231 break;
5232 }
5233
5234 if (!MO.isReg())
5235 continue;
5236 Register Reg = MO.getReg();
5237 if (!Reg)
5238 continue;
5239
5240 // FIXME: Ideally we would have separate instruction definitions with the
5241 // aligned register constraint.
5242 // FIXME: We do not verify inline asm operands, but custom inline asm
5243 // verification is broken anyway
5244 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
5245 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5246 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5247 if (const TargetRegisterClass *SubRC =
5248 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5249 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5250 if (RC)
5251 RC = SubRC;
5252 }
5253 }
5254
5255 // Check that this is the aligned version of the class.
5256 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5257 ErrInfo = "Subtarget requires even aligned vector registers";
5258 return false;
5259 }
5260 }
5261
5262 if (RegClass != -1) {
5263 if (Reg.isVirtual())
5264 continue;
5265
5266 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5267 if (!RC->contains(Reg)) {
5268 ErrInfo = "Operand has incorrect register class.";
5269 return false;
5270 }
5271 }
5272 }
5273
5274 // Verify SDWA
5275 if (isSDWA(MI)) {
5276 if (!ST.hasSDWA()) {
5277 ErrInfo = "SDWA is not supported on this target";
5278 return false;
5279 }
5280
5281 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5282 AMDGPU::OpName::dst_sel}) {
5283 const MachineOperand *MO = getNamedOperand(MI, Op);
5284 if (!MO)
5285 continue;
5286 int64_t Imm = MO->getImm();
5287 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5288 ErrInfo = "Invalid SDWA selection";
5289 return false;
5290 }
5291 }
5292
5293 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5294
5295 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5296 if (OpIdx == -1)
5297 continue;
5298 const MachineOperand &MO = MI.getOperand(OpIdx);
5299
5300 if (!ST.hasSDWAScalar()) {
5301        // Only VGPRs on VI
5302 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5303 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5304 return false;
5305 }
5306 } else {
5307 // No immediates on GFX9
5308 if (!MO.isReg()) {
5309 ErrInfo =
5310 "Only reg allowed as operands in SDWA instructions on GFX9+";
5311 return false;
5312 }
5313 }
5314 }
5315
5316 if (!ST.hasSDWAOmod()) {
5317 // No omod allowed on VI
5318 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5319 if (OMod != nullptr &&
5320 (!OMod->isImm() || OMod->getImm() != 0)) {
5321 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5322 return false;
5323 }
5324 }
5325
5326 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5327 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5328 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5329 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5330 const MachineOperand *Src0ModsMO =
5331 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5332 unsigned Mods = Src0ModsMO->getImm();
5333 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5334 Mods & SISrcMods::SEXT) {
5335 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5336 return false;
5337 }
5338 }
5339
5340 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5341 if (isVOPC(BasicOpcode)) {
5342 if (!ST.hasSDWASdst() && DstIdx != -1) {
5343 // Only vcc allowed as dst on VI for VOPC
5344 const MachineOperand &Dst = MI.getOperand(DstIdx);
5345 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5346 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5347 return false;
5348 }
5349 } else if (!ST.hasSDWAOutModsVOPC()) {
5350 // No clamp allowed on GFX9 for VOPC
5351 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5352 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5353 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5354 return false;
5355 }
5356
5357 // No omod allowed on GFX9 for VOPC
5358 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5359 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5360 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5361 return false;
5362 }
5363 }
5364 }
5365
5366 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5367 if (DstUnused && DstUnused->isImm() &&
5368 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5369 const MachineOperand &Dst = MI.getOperand(DstIdx);
5370 if (!Dst.isReg() || !Dst.isTied()) {
5371 ErrInfo = "Dst register should have tied register";
5372 return false;
5373 }
5374
5375 const MachineOperand &TiedMO =
5376 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5377 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5378 ErrInfo =
5379 "Dst register should be tied to implicit use of preserved register";
5380 return false;
5381 }
5382 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5383 ErrInfo = "Dst register should use same physical register as preserved";
5384 return false;
5385 }
5386 }
5387 }
5388
5389 // Verify MIMG / VIMAGE / VSAMPLE
5390 if (isImage(Opcode) && !MI.mayStore()) {
5391    // Ensure that the return type used is large enough for all the options
5392    // being used. TFE/LWE require an extra result register.
5393 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5394 if (DMask) {
5395 uint64_t DMaskImm = DMask->getImm();
5396 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5397 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5398 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5399 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5400
5401 // Adjust for packed 16 bit values
5402 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5403 RegCount = divideCeil(RegCount, 2);
5404
5405 // Adjust if using LWE or TFE
5406 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5407 RegCount += 1;
5408
5409 const uint32_t DstIdx =
5410 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5411 const MachineOperand &Dst = MI.getOperand(DstIdx);
5412 if (Dst.isReg()) {
5413 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5414 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5415 if (RegCount > DstSize) {
5416 ErrInfo = "Image instruction returns too many registers for dst "
5417 "register class";
5418 return false;
5419 }
5420 }
5421 }
5422 }
5423
5424 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5425 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5426 unsigned ConstantBusCount = 0;
5427 bool UsesLiteral = false;
5428 const MachineOperand *LiteralVal = nullptr;
5429
5430 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5431 if (ImmIdx != -1) {
5432 ++ConstantBusCount;
5433 UsesLiteral = true;
5434 LiteralVal = &MI.getOperand(ImmIdx);
5435 }
5436
5437 SmallVector<Register, 2> SGPRsUsed;
5438 Register SGPRUsed;
5439
5440 // Only look at the true operands. Only a real operand can use the constant
5441 // bus, and we don't want to check pseudo-operands like the source modifier
5442 // flags.
5443 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5444 if (OpIdx == -1)
5445 continue;
5446 const MachineOperand &MO = MI.getOperand(OpIdx);
5447 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5448 if (MO.isReg()) {
5449 SGPRUsed = MO.getReg();
5450 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5451 ++ConstantBusCount;
5452 SGPRsUsed.push_back(SGPRUsed);
5453 }
5454 } else if (!MO.isFI()) { // Treat FI like a register.
5455 if (!UsesLiteral) {
5456 ++ConstantBusCount;
5457 UsesLiteral = true;
5458 LiteralVal = &MO;
5459 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5460 assert(isVOP2(MI) || isVOP3(MI));
5461 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5462 return false;
5463 }
5464 }
5465 }
5466 }
5467
5468 SGPRUsed = findImplicitSGPRRead(MI);
5469 if (SGPRUsed) {
5470 // Implicit uses may safely overlap true operands
5471 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5472 return !RI.regsOverlap(SGPRUsed, SGPR);
5473 })) {
5474 ++ConstantBusCount;
5475 SGPRsUsed.push_back(SGPRUsed);
5476 }
5477 }
5478
5479    // v_writelane_b32 is an exception to the constant bus restriction:
5480    // vsrc0 can be an SGPR, constant or m0, and the lane select an SGPR, m0 or inline-const
5481 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5482 Opcode != AMDGPU::V_WRITELANE_B32) {
5483 ErrInfo = "VOP* instruction violates constant bus restriction";
5484 return false;
5485 }
5486
5487 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5488 ErrInfo = "VOP3 instruction uses literal";
5489 return false;
5490 }
5491 }
5492
5493 // Special case for writelane - this can break the multiple constant bus rule,
5494 // but still can't use more than one SGPR register
5495 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5496 unsigned SGPRCount = 0;
5497 Register SGPRUsed;
5498
5499 for (int OpIdx : {Src0Idx, Src1Idx}) {
5500 if (OpIdx == -1)
5501 break;
5502
5503 const MachineOperand &MO = MI.getOperand(OpIdx);
5504
5505 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5506 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5507 if (MO.getReg() != SGPRUsed)
5508 ++SGPRCount;
5509 SGPRUsed = MO.getReg();
5510 }
5511 }
5512 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5513 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5514 return false;
5515 }
5516 }
5517 }
5518
5519 // Verify misc. restrictions on specific instructions.
5520 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5521 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5522 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5523 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5524 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5525 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5526 if (!compareMachineOp(Src0, Src1) &&
5527 !compareMachineOp(Src0, Src2)) {
5528 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5529 return false;
5530 }
5531 }
5532 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5533 SISrcMods::ABS) ||
5534 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5535 SISrcMods::ABS) ||
5536 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5537 SISrcMods::ABS)) {
5538 ErrInfo = "ABS not allowed in VOP3B instructions";
5539 return false;
5540 }
5541 }
5542
5543 if (isSOP2(MI) || isSOPC(MI)) {
5544 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5545 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5546
5547 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5548 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5549 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5550 !Src0.isIdenticalTo(Src1)) {
5551 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5552 return false;
5553 }
5554 }
5555
5556 if (isSOPK(MI)) {
5557 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5558 if (Desc.isBranch()) {
5559 if (!Op->isMBB()) {
5560 ErrInfo = "invalid branch target for SOPK instruction";
5561 return false;
5562 }
5563 } else {
5564 uint64_t Imm = Op->getImm();
5565 if (sopkIsZext(Opcode)) {
5566 if (!isUInt<16>(Imm)) {
5567 ErrInfo = "invalid immediate for SOPK instruction";
5568 return false;
5569 }
5570 } else {
5571 if (!isInt<16>(Imm)) {
5572 ErrInfo = "invalid immediate for SOPK instruction";
5573 return false;
5574 }
5575 }
5576 }
5577 }
5578
5579 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5580 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5581 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5582 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5583 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5584 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5585
5586 const unsigned StaticNumOps =
5587 Desc.getNumOperands() + Desc.implicit_uses().size();
5588 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5589
5590 // Require additional implicit operands. This allows a fixup done by the
5591 // post RA scheduler where the main implicit operand is killed and
5592 // implicit-defs are added for sub-registers that remain live after this
5593 // instruction.
5594 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5595 ErrInfo = "missing implicit register operands";
5596 return false;
5597 }
5598
5599 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5600 if (IsDst) {
5601 if (!Dst->isUse()) {
5602 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5603 return false;
5604 }
5605
5606 unsigned UseOpIdx;
5607 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5608 UseOpIdx != StaticNumOps + 1) {
5609 ErrInfo = "movrel implicit operands should be tied";
5610 return false;
5611 }
5612 }
5613
5614 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5615 const MachineOperand &ImpUse
5616 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5617 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5618 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5619 ErrInfo = "src0 should be subreg of implicit vector use";
5620 return false;
5621 }
5622 }
5623
5624 // Make sure we aren't losing exec uses in the td files. This mostly requires
5625 // being careful when using let Uses to try to add other use registers.
5626 if (shouldReadExec(MI)) {
5627 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5628 ErrInfo = "VALU instruction does not implicitly read exec mask";
5629 return false;
5630 }
5631 }
5632
5633 if (isSMRD(MI)) {
5634 if (MI.mayStore() &&
5635 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5636 // The register offset form of scalar stores may only use m0 as the
5637 // soffset register.
5638 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5639 if (Soff && Soff->getReg() != AMDGPU::M0) {
5640 ErrInfo = "scalar stores must use m0 as offset register";
5641 return false;
5642 }
5643 }
5644 }
5645
5646 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5647 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5648 if (Offset->getImm() != 0) {
5649 ErrInfo = "subtarget does not support offsets in flat instructions";
5650 return false;
5651 }
5652 }
5653
5654 if (isDS(MI) && !ST.hasGDS()) {
5655 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5656 if (GDSOp && GDSOp->getImm() != 0) {
5657 ErrInfo = "GDS is not supported on this subtarget";
5658 return false;
5659 }
5660 }
5661
5662 if (isImage(MI)) {
5663 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5664 if (DimOp) {
5665 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5666 AMDGPU::OpName::vaddr0);
5667 AMDGPU::OpName RSrcOpName =
5668 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5669 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5670 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5671 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5672 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5673 const AMDGPU::MIMGDimInfo *Dim =
5674          AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5675
5676 if (!Dim) {
5677 ErrInfo = "dim is out of range";
5678 return false;
5679 }
5680
5681 bool IsA16 = false;
5682 if (ST.hasR128A16()) {
5683 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5684 IsA16 = R128A16->getImm() != 0;
5685 } else if (ST.hasA16()) {
5686 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5687 IsA16 = A16->getImm() != 0;
5688 }
5689
5690 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5691
5692 unsigned AddrWords =
5693 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5694
5695 unsigned VAddrWords;
5696 if (IsNSA) {
5697 VAddrWords = RsrcIdx - VAddr0Idx;
5698 if (ST.hasPartialNSAEncoding() &&
5699 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5700 unsigned LastVAddrIdx = RsrcIdx - 1;
5701 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5702 }
5703 } else {
5704 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5705 if (AddrWords > 12)
5706 AddrWords = 16;
5707 }
5708
5709 if (VAddrWords != AddrWords) {
5710 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5711 << " but got " << VAddrWords << "\n");
5712 ErrInfo = "bad vaddr size";
5713 return false;
5714 }
5715 }
5716 }
5717
5718 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5719 if (DppCt) {
5720 using namespace AMDGPU::DPP;
5721
5722 unsigned DC = DppCt->getImm();
5723 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5724 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5725 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5726 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5727 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5728 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5729 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5730 ErrInfo = "Invalid dpp_ctrl value";
5731 return false;
5732 }
5733 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5734 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5735 ErrInfo = "Invalid dpp_ctrl value: "
5736 "wavefront shifts are not supported on GFX10+";
5737 return false;
5738 }
5739 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5740 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5741 ErrInfo = "Invalid dpp_ctrl value: "
5742 "broadcasts are not supported on GFX10+";
5743 return false;
5744 }
5745 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5746 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5747 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5748 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5749 !ST.hasGFX90AInsts()) {
5750 ErrInfo = "Invalid dpp_ctrl value: "
5751 "row_newbroadcast/row_share is not supported before "
5752 "GFX90A/GFX10";
5753 return false;
5754 }
5755 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5756 ErrInfo = "Invalid dpp_ctrl value: "
5757 "row_share and row_xmask are not supported before GFX10";
5758 return false;
5759 }
5760 }
5761
5762 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5764 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5765 ErrInfo = "Invalid dpp_ctrl value: "
5766 "DP ALU dpp only support row_newbcast";
5767 return false;
5768 }
5769 }
5770
5771 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5772 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5773 AMDGPU::OpName DataName =
5774 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5775 const MachineOperand *Data = getNamedOperand(MI, DataName);
5776 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5777 if (Data && !Data->isReg())
5778 Data = nullptr;
5779
5780 if (ST.hasGFX90AInsts()) {
5781 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5782 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5783 ErrInfo = "Invalid register class: "
5784 "vdata and vdst should be both VGPR or AGPR";
5785 return false;
5786 }
5787 if (Data && Data2 &&
5788 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5789 ErrInfo = "Invalid register class: "
5790 "both data operands should be VGPR or AGPR";
5791 return false;
5792 }
5793 } else {
5794 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5795 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5796 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5797 ErrInfo = "Invalid register class: "
5798 "agpr loads and stores not supported on this GPU";
5799 return false;
5800 }
5801 }
5802 }
5803
5804 if (ST.needsAlignedVGPRs()) {
5805 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5806      const MachineOperand *Op = getNamedOperand(MI, OpName);
5807      if (!Op)
5808 return true;
5809 Register Reg = Op->getReg();
5810 if (Reg.isPhysical())
5811 return !(RI.getHWRegIndex(Reg) & 1);
5812 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5813 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5814 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5815 };
5816
5817 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5818 Opcode == AMDGPU::DS_GWS_BARRIER) {
5819
5820 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5821 ErrInfo = "Subtarget requires even aligned vector registers "
5822 "for DS_GWS instructions";
5823 return false;
5824 }
5825 }
5826
5827 if (isMIMG(MI)) {
5828 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5829 ErrInfo = "Subtarget requires even aligned vector registers "
5830 "for vaddr operand of image instructions";
5831 return false;
5832 }
5833 }
5834 }
5835
5836 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5837 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5838 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5839 ErrInfo = "Invalid register class: "
5840 "v_accvgpr_write with an SGPR is not supported on this GPU";
5841 return false;
5842 }
5843 }
5844
5845 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5846 const MachineOperand &SrcOp = MI.getOperand(1);
5847 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5848 ErrInfo = "pseudo expects only physical SGPRs";
5849 return false;
5850 }
5851 }
5852
5853 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5854 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5855 if (!ST.hasScaleOffset()) {
5856 ErrInfo = "Subtarget does not support offset scaling";
5857 return false;
5858 }
5859 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5860 ErrInfo = "Instruction does not support offset scaling";
5861 return false;
5862 }
5863 }
5864 }
5865
5866 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5867 // information.
5868 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5869 for (unsigned I = 0; I < 3; ++I) {
5871 return false;
5872 }
5873 }
5874
5875 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5876 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
5877 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
5878 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
5879 &AMDGPU::SReg_64RegClass) ||
5880 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5881 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5882 return false;
5883 }
5884 }
5885
5886 return true;
5887}
5888
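
// Illustrative sketch, not part of the original file: roughly how a caller
// such as the MachineVerifier consumes the checks implemented above. The
// verifyInstruction() signature is inherited from TargetInstrInfo; the helper
// name is invented for illustration.
static bool isWellFormedMI(const SIInstrInfo &TII, const MachineInstr &MI,
                           StringRef &ErrInfo) {
  // ErrInfo receives a static diagnostic string when verification fails.
  return TII.verifyInstruction(MI, ErrInfo);
}
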
5889// It is more readable to list mapped opcodes on the same line.
5890// clang-format off
5891
5892unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5893  switch (MI.getOpcode()) {
5894 default: return AMDGPU::INSTRUCTION_LIST_END;
5895 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5896 case AMDGPU::COPY: return AMDGPU::COPY;
5897 case AMDGPU::PHI: return AMDGPU::PHI;
5898 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5899 case AMDGPU::WQM: return AMDGPU::WQM;
5900 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5901 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5902 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5903 case AMDGPU::S_MOV_B32: {
5904 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5905 return MI.getOperand(1).isReg() ||
5906 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5907 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5908 }
5909 case AMDGPU::S_ADD_I32:
5910 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5911 case AMDGPU::S_ADDC_U32:
5912 return AMDGPU::V_ADDC_U32_e32;
5913 case AMDGPU::S_SUB_I32:
5914 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5915 // FIXME: These are not consistently handled, and selected when the carry is
5916 // used.
5917 case AMDGPU::S_ADD_U32:
5918 return AMDGPU::V_ADD_CO_U32_e32;
5919 case AMDGPU::S_SUB_U32:
5920 return AMDGPU::V_SUB_CO_U32_e32;
5921 case AMDGPU::S_ADD_U64_PSEUDO:
5922 return AMDGPU::V_ADD_U64_PSEUDO;
5923 case AMDGPU::S_SUB_U64_PSEUDO:
5924 return AMDGPU::V_SUB_U64_PSEUDO;
5925 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5926 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5927 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5928 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5929 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5930 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5931 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5932 case AMDGPU::S_XNOR_B32:
5933 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5934 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5935 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5936 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5937 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5938 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5939 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5940 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5941 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5942 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5943 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5944 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5945 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5946 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5947 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5948 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5949 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5950 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5951 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5952 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5953 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5954 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5955 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5956 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5957 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5958 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5959 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5960 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5961 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5962 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5963 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5964 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5965 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5966 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5967 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5968 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5969 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5970 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5971 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5972 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5973 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5974 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5975 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5976 case AMDGPU::S_CVT_F32_F16:
5977 case AMDGPU::S_CVT_HI_F32_F16:
5978 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5979 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5980 case AMDGPU::S_CVT_F16_F32:
5981 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5982 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5983 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5984 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5985 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5986 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5987 case AMDGPU::S_CEIL_F16:
5988 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5989 : AMDGPU::V_CEIL_F16_fake16_e64;
5990 case AMDGPU::S_FLOOR_F16:
5991 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5992 : AMDGPU::V_FLOOR_F16_fake16_e64;
5993 case AMDGPU::S_TRUNC_F16:
5994 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5995 : AMDGPU::V_TRUNC_F16_fake16_e64;
5996 case AMDGPU::S_RNDNE_F16:
5997 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5998 : AMDGPU::V_RNDNE_F16_fake16_e64;
5999 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6000 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6001 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6002 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6003 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6004 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6005 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6006 case AMDGPU::S_ADD_F16:
6007 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6008 : AMDGPU::V_ADD_F16_fake16_e64;
6009 case AMDGPU::S_SUB_F16:
6010 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6011 : AMDGPU::V_SUB_F16_fake16_e64;
6012 case AMDGPU::S_MIN_F16:
6013 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6014 : AMDGPU::V_MIN_F16_fake16_e64;
6015 case AMDGPU::S_MAX_F16:
6016 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6017 : AMDGPU::V_MAX_F16_fake16_e64;
6018 case AMDGPU::S_MINIMUM_F16:
6019 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6020 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6021 case AMDGPU::S_MAXIMUM_F16:
6022 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6023 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6024 case AMDGPU::S_MUL_F16:
6025 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6026 : AMDGPU::V_MUL_F16_fake16_e64;
6027 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6028 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6029 case AMDGPU::S_FMAC_F16:
6030 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6031 : AMDGPU::V_FMAC_F16_fake16_e64;
6032 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6033 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6034 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6035 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6036 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6037 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6038 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6039 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6040 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6041 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6042 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6043 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6044 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6045 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6046 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6047 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6048 case AMDGPU::S_CMP_LT_F16:
6049 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6050 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6051 case AMDGPU::S_CMP_EQ_F16:
6052 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6053 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6054 case AMDGPU::S_CMP_LE_F16:
6055 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6056 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6057 case AMDGPU::S_CMP_GT_F16:
6058 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6059 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6060 case AMDGPU::S_CMP_LG_F16:
6061 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6062 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6063 case AMDGPU::S_CMP_GE_F16:
6064 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6065 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6066 case AMDGPU::S_CMP_O_F16:
6067 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6068 : AMDGPU::V_CMP_O_F16_fake16_e64;
6069 case AMDGPU::S_CMP_U_F16:
6070 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6071 : AMDGPU::V_CMP_U_F16_fake16_e64;
6072 case AMDGPU::S_CMP_NGE_F16:
6073 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6074 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6075 case AMDGPU::S_CMP_NLG_F16:
6076 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6077 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6078 case AMDGPU::S_CMP_NGT_F16:
6079 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6080 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6081 case AMDGPU::S_CMP_NLE_F16:
6082 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6083 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6084 case AMDGPU::S_CMP_NEQ_F16:
6085 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6086 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6087 case AMDGPU::S_CMP_NLT_F16:
6088 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6089 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6090 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6091 case AMDGPU::V_S_EXP_F16_e64:
6092 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6093 : AMDGPU::V_EXP_F16_fake16_e64;
6094 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6095 case AMDGPU::V_S_LOG_F16_e64:
6096 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6097 : AMDGPU::V_LOG_F16_fake16_e64;
6098 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6099 case AMDGPU::V_S_RCP_F16_e64:
6100 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6101 : AMDGPU::V_RCP_F16_fake16_e64;
6102 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6103 case AMDGPU::V_S_RSQ_F16_e64:
6104 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6105 : AMDGPU::V_RSQ_F16_fake16_e64;
6106 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6107 case AMDGPU::V_S_SQRT_F16_e64:
6108 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6109 : AMDGPU::V_SQRT_F16_fake16_e64;
6110 }
6112 "Unexpected scalar opcode without corresponding vector one!");
6113}
6114
6115// clang-format on
6116
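
// Illustrative sketch, not part of the original file: the mapping above is
// typically consumed by treating AMDGPU::INSTRUCTION_LIST_END as "no direct
// VALU equivalent" and falling back to custom lowering. The helper name is
// invented for illustration.
static bool hasDirectVALUEquivalent(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  return TII.getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
}
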
6117void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
6118                                        MachineBasicBlock &MBB,
6119                                        MachineBasicBlock::iterator MBBI,
6120                                        const DebugLoc &DL, Register Reg,
6121 bool IsSCCLive,
6122 SlotIndexes *Indexes) const {
6123 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6124 const SIInstrInfo *TII = ST.getInstrInfo();
6125  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
6126  if (IsSCCLive) {
6127 // Insert two move instructions, one to save the original value of EXEC and
6128 // the other to turn on all bits in EXEC. This is required as we can't use
6129 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
6130 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6131                           .addReg(LMC.ExecReg);
6132    auto FlipExecMI =
6133 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6134 if (Indexes) {
6135 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6136 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6137 }
6138 } else {
6139 auto SaveExec =
6140 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6141 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6142 if (Indexes)
6143 Indexes->insertMachineInstrInMaps(*SaveExec);
6144 }
6145}
6146
6147void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6148                              MachineBasicBlock::iterator MBBI,
6149                              const DebugLoc &DL, Register Reg,
6150 SlotIndexes *Indexes) const {
6151  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
6152  auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6153 .addReg(Reg, RegState::Kill);
6154 if (Indexes)
6155 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6156}
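
// Illustrative sketch, not part of the original file: the exec save/restore
// pattern implemented above, reduced to its essence for a wave64 target. The
// helper name, register choices and insertion point are assumptions.
static void runWithAllLanesEnabled(const SIInstrInfo &TII,
                                   MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator I,
                                   const DebugLoc &DL, Register SavedExec) {
  // Save EXEC and enable every lane. S_OR_SAVEEXEC clobbers SCC, which is why
  // the code above falls back to a pair of S_MOVs when SCC is live.
  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_OR_SAVEEXEC_B64), SavedExec).addImm(-1);
  // ... whole-wave work would be emitted here ...
  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
      .addReg(SavedExec, RegState::Kill);
}
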
6157
6161 "Not a whole wave func");
6162 MachineBasicBlock &MBB = *MF.begin();
6163 for (MachineInstr &MI : MBB)
6164 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6165 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6166 return &MI;
6167
6168 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6169}
6170
6171const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6172                                                      unsigned OpNo) const {
6173 const MCInstrDesc &Desc = get(MI.getOpcode());
6174 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6175 Desc.operands()[OpNo].RegClass == -1) {
6176 Register Reg = MI.getOperand(OpNo).getReg();
6177
6178 if (Reg.isVirtual()) {
6179 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6180 return MRI.getRegClass(Reg);
6181 }
6182 return RI.getPhysRegBaseClass(Reg);
6183 }
6184
6185 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6186 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6187}
6188
6189void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI,
6190                                     unsigned OpIdx) const {
6191  MachineBasicBlock *MBB = MI.getParent();
6192 MachineOperand &MO = MI.getOperand(OpIdx);
6193 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6194 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6195 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6196 unsigned Size = RI.getRegSizeInBits(*RC);
6197 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6198 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6199 : AMDGPU::V_MOV_B32_e32;
6200 if (MO.isReg())
6201 Opcode = AMDGPU::COPY;
6202 else if (RI.isSGPRClass(RC))
6203 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6204
6205 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6206 Register Reg = MRI.createVirtualRegister(VRC);
6207 DebugLoc DL = MBB->findDebugLoc(I);
6208 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6209 MO.ChangeToRegister(Reg, false);
6210}
6211
6214 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6215 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6216 if (!SuperReg.getReg().isVirtual())
6217 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6218
6219 MachineBasicBlock *MBB = MI->getParent();
6220 const DebugLoc &DL = MI->getDebugLoc();
6221 Register SubReg = MRI.createVirtualRegister(SubRC);
6222
6223 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6224 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6225 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6226 return SubReg;
6227}
6228
6231 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6232 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6233 if (Op.isImm()) {
6234 if (SubIdx == AMDGPU::sub0)
6235 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6236 if (SubIdx == AMDGPU::sub1)
6237 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6238
6239 llvm_unreachable("Unhandled register index for immediate");
6240 }
6241
6242 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6243 SubIdx, SubRC);
6244 return MachineOperand::CreateReg(SubReg, false);
6245}
6246
6247// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6248void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6249 assert(Inst.getNumExplicitOperands() == 3);
6250 MachineOperand Op1 = Inst.getOperand(1);
6251 Inst.removeOperand(1);
6252 Inst.addOperand(Op1);
6253}
6254
6255bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6256                                    const MCOperandInfo &OpInfo,
6257 const MachineOperand &MO) const {
6258 if (!MO.isReg())
6259 return false;
6260
6261 Register Reg = MO.getReg();
6262
6263 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6264 if (Reg.isPhysical())
6265 return DRC->contains(Reg);
6266
6267 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6268
6269 if (MO.getSubReg()) {
6270 const MachineFunction *MF = MO.getParent()->getMF();
6271 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6272 if (!SuperRC)
6273 return false;
6274 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6275 }
6276
6277 return RI.getCommonSubClass(DRC, RC) != nullptr;
6278}
6279
6280bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6281                                    const MachineOperand &MO) const {
6282 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6283 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6284 unsigned Opc = MI.getOpcode();
6285
6286 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6287 // information.
6288 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6289 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6290 constexpr AMDGPU::OpName OpNames[] = {
6291 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6292
6293 for (auto [I, OpName] : enumerate(OpNames)) {
6294 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6295 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6296          !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6297        return false;
6298 }
6299 }
6300
6301 if (!isLegalRegOperand(MRI, OpInfo, MO))
6302 return false;
6303
6304 // check Accumulate GPR operand
6305 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6306 if (IsAGPR && !ST.hasMAIInsts())
6307 return false;
6308 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6309 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6310 return false;
6311 // Atomics should have both vdst and vdata either vgpr or agpr.
6312 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6313 const int DataIdx = AMDGPU::getNamedOperandIdx(
6314 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6315 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6316 MI.getOperand(DataIdx).isReg() &&
6317 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6318 return false;
6319 if ((int)OpIdx == DataIdx) {
6320 if (VDstIdx != -1 &&
6321 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6322 return false;
6323 // DS instructions with 2 src operands also must have tied RC.
6324 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6325 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6326 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6327 return false;
6328 }
6329
6330 // Check V_ACCVGPR_WRITE_B32_e64
6331 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6332 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6333 RI.isSGPRReg(MRI, MO.getReg()))
6334 return false;
6335
6336 if (ST.hasFlatScratchHiInB64InstHazard() &&
6337 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6338 if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
6339 if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
6340 64)
6341 return false;
6342 }
6343 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6344 return false;
6345 }
6346
6347 return true;
6348}
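
// Illustrative sketch, not part of the original file: the vdst/vdata agreement
// rule checked above, stated as a standalone predicate. For loads, stores and
// atomics both operands must be AGPRs or both must be VGPRs.
static bool agprClassesAgree(const SIRegisterInfo &RI,
                             const MachineRegisterInfo &MRI, Register VDst,
                             Register VData) {
  return RI.isAGPR(MRI, VDst) == RI.isAGPR(MRI, VData);
}
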
6349
6350bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6351                                     const MCOperandInfo &OpInfo,
6352 const MachineOperand &MO) const {
6353 if (MO.isReg())
6354 return isLegalRegOperand(MRI, OpInfo, MO);
6355
6356 // Handle non-register types that are treated like immediates.
6357 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6358 return true;
6359}
6360
6361bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6362    const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6363 const MachineOperand *MO) const {
6364 constexpr unsigned NumOps = 3;
6365 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6366 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6367 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6368 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6369
6370 assert(SrcN < NumOps);
6371
6372 if (!MO) {
6373 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6374 if (SrcIdx == -1)
6375 return true;
6376 MO = &MI.getOperand(SrcIdx);
6377 }
6378
6379 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6380 return true;
6381
6382 int ModsIdx =
6383 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6384 if (ModsIdx == -1)
6385 return true;
6386
6387 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6388 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6389 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6390
6391 return !OpSel && !OpSelHi;
6392}
6393
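
// Illustrative sketch, not part of the original file: the modifier test used
// above, in isolation. An SGPR source of a packed FP32 instruction on gfx12+
// is only legal when neither op_sel bit is set for that source.
static bool usesAnyOpSelBit(unsigned SrcMods) {
  return (SrcMods & SISrcMods::OP_SEL_0) || (SrcMods & SISrcMods::OP_SEL_1);
}
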
6394bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6395                                 const MachineOperand *MO) const {
6396 const MachineFunction &MF = *MI.getMF();
6397 const MachineRegisterInfo &MRI = MF.getRegInfo();
6398 const MCInstrDesc &InstDesc = MI.getDesc();
6399 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6400 int64_t RegClass = getOpRegClassID(OpInfo);
6401 const TargetRegisterClass *DefinedRC =
6402 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6403 if (!MO)
6404 MO = &MI.getOperand(OpIdx);
6405
6406 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6407
6408 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6409 const MachineOperand *UsedLiteral = nullptr;
6410
6411 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6412 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6413
6414 // TODO: Be more permissive with frame indexes.
6415 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6416 if (!LiteralLimit--)
6417 return false;
6418
6419 UsedLiteral = MO;
6420 }
6421
6422    SmallDenseSet<RegSubRegPair> SGPRsUsed;
6423    if (MO->isReg())
6424 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6425
6426 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6427 if (i == OpIdx)
6428 continue;
6429 const MachineOperand &Op = MI.getOperand(i);
6430 if (Op.isReg()) {
6431 if (Op.isUse()) {
6432 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6433 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6434 if (--ConstantBusLimit <= 0)
6435 return false;
6436 }
6437 }
6438 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6439 !isInlineConstant(Op, InstDesc.operands()[i])) {
6440 // The same literal may be used multiple times.
6441 if (!UsedLiteral)
6442 UsedLiteral = &Op;
6443 else if (UsedLiteral->isIdenticalTo(Op))
6444 continue;
6445
6446 if (!LiteralLimit--)
6447 return false;
6448 if (--ConstantBusLimit <= 0)
6449 return false;
6450 }
6451 }
6452 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6453 // There can be at most one literal operand, but it can be repeated.
6454 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6455 if (i == OpIdx)
6456 continue;
6457 const MachineOperand &Op = MI.getOperand(i);
6458 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6459 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6460 !Op.isIdenticalTo(*MO))
6461 return false;
6462
6463      // Do not fold a non-inlineable and non-register operand into an
6464      // instruction that already has a frame index. The frame index handling
6465      // code does not cope well with a frame index that co-exists with another
6466      // non-register operand, unless that operand is an inlineable immediate.
6467 if (Op.isFI())
6468 return false;
6469 }
6470 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6471 isF16PseudoScalarTrans(MI.getOpcode())) {
6472 return false;
6473 }
6474
6475 if (MO->isReg()) {
6476 if (!DefinedRC)
6477 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6478 return isLegalRegOperand(MI, OpIdx, *MO);
6479 }
6480
6481 if (MO->isImm()) {
6482 uint64_t Imm = MO->getImm();
6483 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6484 bool Is64BitOp = Is64BitFPOp ||
6485 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6486 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6487 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6488 if (Is64BitOp &&
6489 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6490 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6491 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6492 return false;
6493
6494 // FIXME: We can use sign extended 64-bit literals, but only for signed
6495 // operands. At the moment we do not know if an operand is signed.
6496 // Such operand will be encoded as its low 32 bits and then either
6497 // correctly sign extended or incorrectly zero extended by HW.
6498 // If 64-bit literals are supported and the literal will be encoded
6499 // as full 64 bit we still can use it.
6500 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6501 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6502 return false;
6503 }
6504 }
6505
6506 // Handle non-register types that are treated like immediates.
6507 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6508
6509 if (!DefinedRC) {
6510 // This operand expects an immediate.
6511 return true;
6512 }
6513
6514 return isImmOperandLegal(MI, OpIdx, *MO);
6515}
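
// Illustrative sketch, not part of the original file: the typical pattern for
// callers of isOperandLegal() such as operand folding, which tests a
// prospective operand before committing to the rewrite. `Folded` is an
// invented name for the candidate operand.
static bool canFoldOperand(const SIInstrInfo &TII, const MachineInstr &UseMI,
                           unsigned OpIdx, const MachineOperand &Folded) {
  // Checks constant-bus, literal and register-class constraints without
  // mutating UseMI.
  return TII.isOperandLegal(UseMI, OpIdx, &Folded);
}
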
6516
6518 bool IsGFX950Only = ST.hasGFX950Insts();
6519 bool IsGFX940Only = ST.hasGFX940Insts();
6520
6521 if (!IsGFX950Only && !IsGFX940Only)
6522 return false;
6523
6524 if (!isVALU(MI))
6525 return false;
6526
6527 // V_COS, V_EXP, V_RCP, etc.
6528 if (isTRANS(MI))
6529 return true;
6530
6531 // DOT2, DOT2C, DOT4, etc.
6532 if (isDOT(MI))
6533 return true;
6534
6535 // MFMA, SMFMA
6536 if (isMFMA(MI))
6537 return true;
6538
6539 unsigned Opcode = MI.getOpcode();
6540 switch (Opcode) {
6541 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6542 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6543 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6544 case AMDGPU::V_MQSAD_U32_U8_e64:
6545 case AMDGPU::V_PK_ADD_F16:
6546 case AMDGPU::V_PK_ADD_F32:
6547 case AMDGPU::V_PK_ADD_I16:
6548 case AMDGPU::V_PK_ADD_U16:
6549 case AMDGPU::V_PK_ASHRREV_I16:
6550 case AMDGPU::V_PK_FMA_F16:
6551 case AMDGPU::V_PK_FMA_F32:
6552 case AMDGPU::V_PK_FMAC_F16_e32:
6553 case AMDGPU::V_PK_FMAC_F16_e64:
6554 case AMDGPU::V_PK_LSHLREV_B16:
6555 case AMDGPU::V_PK_LSHRREV_B16:
6556 case AMDGPU::V_PK_MAD_I16:
6557 case AMDGPU::V_PK_MAD_U16:
6558 case AMDGPU::V_PK_MAX_F16:
6559 case AMDGPU::V_PK_MAX_I16:
6560 case AMDGPU::V_PK_MAX_U16:
6561 case AMDGPU::V_PK_MIN_F16:
6562 case AMDGPU::V_PK_MIN_I16:
6563 case AMDGPU::V_PK_MIN_U16:
6564 case AMDGPU::V_PK_MOV_B32:
6565 case AMDGPU::V_PK_MUL_F16:
6566 case AMDGPU::V_PK_MUL_F32:
6567 case AMDGPU::V_PK_MUL_LO_U16:
6568 case AMDGPU::V_PK_SUB_I16:
6569 case AMDGPU::V_PK_SUB_U16:
6570 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6571 return true;
6572 default:
6573 return false;
6574 }
6575}
6576
6577void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6578                                       MachineInstr &MI) const {
6579 unsigned Opc = MI.getOpcode();
6580 const MCInstrDesc &InstrDesc = get(Opc);
6581
6582 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6583 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6584
6585 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6586 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6587
6588 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6589 // we need to only have one constant bus use before GFX10.
6590 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6591 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6592 RI.isSGPRReg(MRI, Src0.getReg()))
6593 legalizeOpWithMove(MI, Src0Idx);
6594
6595 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6596 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6597 // src0/src1 with V_READFIRSTLANE.
6598 if (Opc == AMDGPU::V_WRITELANE_B32) {
6599 const DebugLoc &DL = MI.getDebugLoc();
6600 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6601 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6602 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6603 .add(Src0);
6604 Src0.ChangeToRegister(Reg, false);
6605 }
6606 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6607 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6608 const DebugLoc &DL = MI.getDebugLoc();
6609 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6610 .add(Src1);
6611 Src1.ChangeToRegister(Reg, false);
6612 }
6613 return;
6614 }
6615
6616 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6617 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6618 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6619 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6620 legalizeOpWithMove(MI, Src2Idx);
6621 }
6622
6623 // VOP2 src0 instructions support all operand types, so we don't need to check
6624 // their legality. If src1 is already legal, we don't need to do anything.
6625 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6626 return;
6627
6628 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6629 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6630 // select is uniform.
6631 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6632 RI.isVGPR(MRI, Src1.getReg())) {
6633 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6634 const DebugLoc &DL = MI.getDebugLoc();
6635 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6636 .add(Src1);
6637 Src1.ChangeToRegister(Reg, false);
6638 return;
6639 }
6640
6641 // We do not use commuteInstruction here because it is too aggressive and will
6642 // commute if it is possible. We only want to commute here if it improves
6643 // legality. This can be called a fairly large number of times so don't waste
6644 // compile time pointlessly swapping and checking legality again.
6645 if (HasImplicitSGPR || !MI.isCommutable()) {
6646 legalizeOpWithMove(MI, Src1Idx);
6647 return;
6648 }
6649
6650 // If src0 can be used as src1, commuting will make the operands legal.
6651 // Otherwise we have to give up and insert a move.
6652 //
6653 // TODO: Other immediate-like operand kinds could be commuted if there was a
6654 // MachineOperand::ChangeTo* for them.
6655 if ((!Src1.isImm() && !Src1.isReg()) ||
6656 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6657 legalizeOpWithMove(MI, Src1Idx);
6658 return;
6659 }
6660
6661 int CommutedOpc = commuteOpcode(MI);
6662 if (CommutedOpc == -1) {
6663 legalizeOpWithMove(MI, Src1Idx);
6664 return;
6665 }
6666
6667 MI.setDesc(get(CommutedOpc));
6668
6669 Register Src0Reg = Src0.getReg();
6670 unsigned Src0SubReg = Src0.getSubReg();
6671 bool Src0Kill = Src0.isKill();
6672
6673 if (Src1.isImm())
6674 Src0.ChangeToImmediate(Src1.getImm());
6675 else if (Src1.isReg()) {
6676 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6677 Src0.setSubReg(Src1.getSubReg());
6678 } else
6679 llvm_unreachable("Should only have register or immediate operands");
6680
6681 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6682 Src1.setSubReg(Src0SubReg);
6683  fixImplicitOperands(MI);
6684}
6685
6686// Legalize VOP3 operands. All operand types are supported for any operand
6687// but only one literal constant and only starting from GFX10.
6688void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6689                                       MachineInstr &MI) const {
6690 unsigned Opc = MI.getOpcode();
6691
6692 int VOP3Idx[3] = {
6693 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6694 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6695 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6696 };
6697
6698 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6699 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6700 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6701 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6702 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6703 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6704 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6705 // src1 and src2 must be scalar
6706 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6707 const DebugLoc &DL = MI.getDebugLoc();
6708 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6709 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6710 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6711 .add(Src1);
6712 Src1.ChangeToRegister(Reg, false);
6713 }
6714 if (VOP3Idx[2] != -1) {
6715 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6716 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6717 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6718 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6719 .add(Src2);
6720 Src2.ChangeToRegister(Reg, false);
6721 }
6722 }
6723 }
6724
6725 // Find the one SGPR operand we are allowed to use.
6726 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6727 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6728 SmallDenseSet<unsigned> SGPRsUsed;
6729 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6730 if (SGPRReg) {
6731 SGPRsUsed.insert(SGPRReg);
6732 --ConstantBusLimit;
6733 }
6734
6735 for (int Idx : VOP3Idx) {
6736 if (Idx == -1)
6737 break;
6738 MachineOperand &MO = MI.getOperand(Idx);
6739
6740 if (!MO.isReg()) {
6741 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6742 continue;
6743
6744 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6745 --LiteralLimit;
6746 --ConstantBusLimit;
6747 continue;
6748 }
6749
6750 --LiteralLimit;
6751 --ConstantBusLimit;
6752 legalizeOpWithMove(MI, Idx);
6753 continue;
6754 }
6755
6756 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6757 continue; // VGPRs are legal
6758
6759 // We can use one SGPR in each VOP3 instruction prior to GFX10
6760 // and two starting from GFX10.
6761 if (SGPRsUsed.count(MO.getReg()))
6762 continue;
6763 if (ConstantBusLimit > 0) {
6764 SGPRsUsed.insert(MO.getReg());
6765 --ConstantBusLimit;
6766 continue;
6767 }
6768
6769 // If we make it this far, then the operand is not legal and we must
6770 // legalize it.
6771 legalizeOpWithMove(MI, Idx);
6772 }
6773
6774 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6775 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6776 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6777 legalizeOpWithMove(MI, VOP3Idx[2]);
6778
6779 // Fix the register class of packed FP32 instructions on gfx12+. See
6780 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6781  if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6782    for (unsigned I = 0; I < 3; ++I) {
6783      if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6784        legalizeOpWithMove(MI, VOP3Idx[I]);
6785 }
6786 }
6787}
6788
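
// Illustrative sketch, not part of the original file: the budgeting rule used
// by the VOP3 legalization above, reduced to a pure predicate. Each distinct
// SGPR and each literal consumes one constant-bus slot; anything beyond the
// subtarget limit has to be rewritten to use a VGPR.
static bool withinConstantBusBudget(int BusLimit, unsigned NumDistinctSGPRs,
                                    unsigned NumLiterals) {
  return static_cast<int>(NumDistinctSGPRs + NumLiterals) <= BusLimit;
}
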
6789Register SIInstrInfo::readlaneVGPRToSGPR(
6790    Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6791    const TargetRegisterClass *DstRC /*=nullptr*/) const {
6792 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6793 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6794 if (DstRC)
6795 SRC = RI.getCommonSubClass(SRC, DstRC);
6796
6797 Register DstReg = MRI.createVirtualRegister(SRC);
6798 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6799
6800 if (RI.hasAGPRs(VRC)) {
6801 VRC = RI.getEquivalentVGPRClass(VRC);
6802 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6803 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6804 get(TargetOpcode::COPY), NewSrcReg)
6805 .addReg(SrcReg);
6806 SrcReg = NewSrcReg;
6807 }
6808
6809 if (SubRegs == 1) {
6810 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6811 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6812 .addReg(SrcReg);
6813 return DstReg;
6814 }
6815
6816  SmallVector<Register, 8> SRegs;
6817  for (unsigned i = 0; i < SubRegs; ++i) {
6818 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6819 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6820 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6821 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6822 SRegs.push_back(SGPR);
6823 }
6824
6825 MachineInstrBuilder MIB =
6826 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6827 get(AMDGPU::REG_SEQUENCE), DstReg);
6828 for (unsigned i = 0; i < SubRegs; ++i) {
6829 MIB.addReg(SRegs[i]);
6830 MIB.addImm(RI.getSubRegFromChannel(i));
6831 }
6832 return DstReg;
6833}
6834
6835 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6836 MachineInstr &MI) const {
6837
6838 // If the pointer is stored in VGPRs, then we need to move it to
6839 // SGPRs using v_readfirstlane. This is safe because we only select
6840 // loads with uniform pointers to SMRD instructions, so we know the
6841 // pointer value is uniform.
6842 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6843 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6844 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6845 SBase->setReg(SGPR);
6846 }
6847 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6848 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6849 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6850 SOff->setReg(SGPR);
6851 }
6852}
6853
6854 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6855 unsigned Opc = Inst.getOpcode();
6856 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6857 if (OldSAddrIdx < 0)
6858 return false;
6859
6860 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6861
6862 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6863 if (NewOpc < 0)
6864 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6865 if (NewOpc < 0)
6866 return false;
6867
6868 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6869 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6870 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6871 return false;
6872
6873 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6874 if (NewVAddrIdx < 0)
6875 return false;
6876
6877 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6878
6879 // Check vaddr, it shall be zero or absent.
6880 MachineInstr *VAddrDef = nullptr;
6881 if (OldVAddrIdx >= 0) {
6882 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6883 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6884 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6885 !VAddrDef->getOperand(1).isImm() ||
6886 VAddrDef->getOperand(1).getImm() != 0)
6887 return false;
6888 }
6889
6890 const MCInstrDesc &NewDesc = get(NewOpc);
6891 Inst.setDesc(NewDesc);
6892
6893 // Callers expect iterator to be valid after this call, so modify the
6894 // instruction in place.
6895 if (OldVAddrIdx == NewVAddrIdx) {
6896 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6897 // Clear use list from the old vaddr holding a zero register.
6898 MRI.removeRegOperandFromUseList(&NewVAddr);
6899 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6900 Inst.removeOperand(OldSAddrIdx);
6901 // Update the use list with the pointer we have just moved from vaddr to
6902 // saddr position. Otherwise the new vaddr will be missing from the use list.
6903 MRI.removeRegOperandFromUseList(&NewVAddr);
6904 MRI.addRegOperandToUseList(&NewVAddr);
6905 } else {
6906 assert(OldSAddrIdx == NewVAddrIdx);
6907
6908 if (OldVAddrIdx >= 0) {
6909 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6910 AMDGPU::OpName::vdst_in);
6911
6912 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6913 // it asserts. Untie the operands for now and retie them afterwards.
6914 if (NewVDstIn != -1) {
6915 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6916 Inst.untieRegOperand(OldVDstIn);
6917 }
6918
6919 Inst.removeOperand(OldVAddrIdx);
6920
6921 if (NewVDstIn != -1) {
6922 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6923 Inst.tieOperands(NewVDst, NewVDstIn);
6924 }
6925 }
6926 }
6927
6928 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6929 VAddrDef->eraseFromParent();
6930
6931 return true;
6932}
6933
6934// FIXME: Remove this when SelectionDAG is obsoleted.
6935 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6936 MachineInstr &MI) const {
6937 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6938 return;
6939
6940 // Fix up SGPR operands in VGPRs. We only select these when the DAG divergence
6941 // analysis thinks they are uniform, so a readfirstlane should be valid.
6942 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6943 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6944 return;
6945
6946 if (moveFlatAddrToVGPR(MI))
6947 return;
6948
6949 const TargetRegisterClass *DeclaredRC =
6950 getRegClass(MI.getDesc(), SAddr->getOperandNo());
6951
6952 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6953 SAddr->setReg(ToSGPR);
6954}
6955
6956 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6957 MachineBasicBlock::iterator I,
6958 const TargetRegisterClass *DstRC,
6959 MachineOperand &Op,
6960 MachineRegisterInfo &MRI,
6961 const DebugLoc &DL) const {
6962 Register OpReg = Op.getReg();
6963 unsigned OpSubReg = Op.getSubReg();
6964
6965 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6966 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6967
6968 // Check if operand is already the correct register class.
6969 if (DstRC == OpRC)
6970 return;
6971
6972 Register DstReg = MRI.createVirtualRegister(DstRC);
6973 auto Copy =
6974 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6975 Op.setReg(DstReg);
6976
6977 MachineInstr *Def = MRI.getVRegDef(OpReg);
6978 if (!Def)
6979 return;
6980
6981 // Try to eliminate the copy if it is copying an immediate value.
6982 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6983 foldImmediate(*Copy, *Def, OpReg, &MRI);
6984
6985 bool ImpDef = Def->isImplicitDef();
6986 while (!ImpDef && Def && Def->isCopy()) {
6987 if (Def->getOperand(1).getReg().isPhysical())
6988 break;
6989 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6990 ImpDef = Def && Def->isImplicitDef();
6991 }
6992 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6993 !ImpDef)
6994 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6995}
6996
6997// Emit the actual waterfall loop, executing the wrapped instruction for each
6998// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6999// iteration, in the worst case we execute 64 (once per lane).
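// As a rough sketch (illustrative only; the actual lane-mask opcodes depend
// on the wave size), the code emitted below for a single 32-bit ScalarOp has
// this shape:
//
//   LoopBB:
//     %sgpr     = V_READFIRSTLANE_B32 %vgpr_scalarop
//     %cond     = V_CMP_EQ_U32_e64 %sgpr, %vgpr_scalarop
//     %saveexec = <and-saveexec> %cond         ; run only the matching lanes
//   BodyBB:
//     ... the wrapped instruction, now reading %sgpr ...
//     $exec = <xor-term> $exec, %saveexec      ; retire the lanes just handled
//     SI_WATERFALL_LOOP %LoopBB                ; repeat while any lane remains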
7000static void
7001 emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
7002 MachineRegisterInfo &MRI,
7003 MachineBasicBlock &LoopBB,
7004 MachineBasicBlock &BodyBB,
7005 const DebugLoc &DL,
7006 ArrayRef<MachineOperand *> ScalarOps) {
7007 MachineFunction &MF = *LoopBB.getParent();
7008 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7009 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7011 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7012
7014 Register CondReg;
7015
7016 for (MachineOperand *ScalarOp : ScalarOps) {
7017 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
7018 unsigned NumSubRegs = RegSize / 32;
7019 Register VScalarOp = ScalarOp->getReg();
7020
7021 if (NumSubRegs == 1) {
7022 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7023
7024 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
7025 .addReg(VScalarOp);
7026
7027 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7028
7029 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
7030 .addReg(CurReg)
7031 .addReg(VScalarOp);
7032
7033 // Combine the comparison results with AND.
7034 if (!CondReg) // First.
7035 CondReg = NewCondReg;
7036 else { // If not the first, we create an AND.
7037 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7038 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7039 .addReg(CondReg)
7040 .addReg(NewCondReg);
7041 CondReg = AndReg;
7042 }
7043
7044 // Update ScalarOp operand to use the SGPR ScalarOp.
7045 ScalarOp->setReg(CurReg);
7046 ScalarOp->setIsKill();
7047 } else {
7048 SmallVector<Register, 8> ReadlanePieces;
7049 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
7050 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7051 "Unhandled register size");
7052
7053 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7054 Register CurRegLo =
7055 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7056 Register CurRegHi =
7057 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7058
7059 // Read the next variant <- also loop target.
7060 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
7061 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
7062
7063 // Read the next variant <- also loop target.
7064 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
7065 .addReg(VScalarOp, VScalarOpUndef,
7066 TRI->getSubRegFromChannel(Idx + 1));
7067
7068 ReadlanePieces.push_back(CurRegLo);
7069 ReadlanePieces.push_back(CurRegHi);
7070
7071 // Comparison is to be done as 64-bit.
7072 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
7073 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
7074 .addReg(CurRegLo)
7075 .addImm(AMDGPU::sub0)
7076 .addReg(CurRegHi)
7077 .addImm(AMDGPU::sub1);
7078
7079 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7080 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
7081 NewCondReg)
7082 .addReg(CurReg);
7083 if (NumSubRegs <= 2)
7084 Cmp.addReg(VScalarOp);
7085 else
7086 Cmp.addReg(VScalarOp, VScalarOpUndef,
7087 TRI->getSubRegFromChannel(Idx, 2));
7088
7089 // Combine the comparison results with AND.
7090 if (!CondReg) // First.
7091 CondReg = NewCondReg;
7092 else { // If not the first, we create an AND.
7093 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7094 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7095 .addReg(CondReg)
7096 .addReg(NewCondReg);
7097 CondReg = AndReg;
7098 }
7099 } // End for loop.
7100
7101 const auto *SScalarOpRC =
7102 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
7103 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
7104
7105 // Build scalar ScalarOp.
7106 auto Merge =
7107 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7108 unsigned Channel = 0;
7109 for (Register Piece : ReadlanePieces) {
7110 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7111 }
7112
7113 // Update ScalarOp operand to use the SGPR ScalarOp.
7114 ScalarOp->setReg(SScalarOp);
7115 ScalarOp->setIsKill();
7116 }
7117 }
7118
7119 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7120 MRI.setSimpleHint(SaveExec, CondReg);
7121
7122 // Update EXEC to matching lanes, saving original to SaveExec.
7123 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7124 .addReg(CondReg, RegState::Kill);
7125
7126 // The original instruction is here; we insert the terminators after it.
7127 I = BodyBB.end();
7128
7129 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7130 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7131 .addReg(LMC.ExecReg)
7132 .addReg(SaveExec);
7133
7134 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7135}
7136
7137// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7138// with SGPRs by iterating over all unique values across all lanes.
7139// Returns the loop basic block that now contains \p MI.
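// Schematically, the block structure built below is:
//
//   MBB -> LoopBB -> BodyBB -> RemainderBB
//            ^----------'
//
// with EXEC (and SCC, if still live) saved in MBB and restored at the start
// of RemainderBB.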
7140static MachineBasicBlock *
7141 loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
7142 ArrayRef<MachineOperand *> ScalarOps,
7143 MachineDominatorTree *MDT,
7144 MachineBasicBlock::iterator Begin = nullptr,
7145 MachineBasicBlock::iterator End = nullptr) {
7146 MachineBasicBlock &MBB = *MI.getParent();
7147 MachineFunction &MF = *MBB.getParent();
7148 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7149 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7150 MachineRegisterInfo &MRI = MF.getRegInfo();
7151 if (!Begin.isValid())
7152 Begin = &MI;
7153 if (!End.isValid()) {
7154 End = &MI;
7155 ++End;
7156 }
7157 const DebugLoc &DL = MI.getDebugLoc();
7159 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7160
7161 // Save SCC. The waterfall loop may overwrite SCC.
7162 Register SaveSCCReg;
7163
7164 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7165 // rather than doing an unlimited scan everywhere.
7166 bool SCCNotDead =
7167 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7168 std::numeric_limits<unsigned>::max()) !=
7169 MachineBasicBlock::LQR_Dead;
7170 if (SCCNotDead) {
7171 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7172 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7173 .addImm(1)
7174 .addImm(0);
7175 }
7176
7177 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7178
7179 // Save the EXEC mask
7180 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7181
7182 // Killed uses in the instruction we are waterfalling around will be
7183 // incorrect due to the added control-flow.
7184 MachineBasicBlock::iterator AfterMI = MI;
7185 ++AfterMI;
7186 for (auto I = Begin; I != AfterMI; I++) {
7187 for (auto &MO : I->all_uses())
7188 MRI.clearKillFlags(MO.getReg());
7189 }
7190
7191 // To insert the loop we need to split the block. Move everything after this
7192 // point to a new block, and insert a new empty block between the two.
7193 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
7194 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
7195 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7196 MachineFunction::iterator MBBI(MBB);
7197 ++MBBI;
7198
7199 MF.insert(MBBI, LoopBB);
7200 MF.insert(MBBI, BodyBB);
7201 MF.insert(MBBI, RemainderBB);
7202
7203 LoopBB->addSuccessor(BodyBB);
7204 BodyBB->addSuccessor(LoopBB);
7205 BodyBB->addSuccessor(RemainderBB);
7206
7207 // Move Begin to MI to the BodyBB, and the remainder of the block to
7208 // RemainderBB.
7209 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7210 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7211 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7212
7213 MBB.addSuccessor(LoopBB);
7214
7215 // Update dominators. We know that MBB immediately dominates LoopBB, that
7216 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7217 // RemainderBB. RemainderBB immediately dominates all of the successors
7218 // transferred to it from MBB that MBB used to properly dominate.
7219 if (MDT) {
7220 MDT->addNewBlock(LoopBB, &MBB);
7221 MDT->addNewBlock(BodyBB, LoopBB);
7222 MDT->addNewBlock(RemainderBB, BodyBB);
7223 for (auto &Succ : RemainderBB->successors()) {
7224 if (MDT->properlyDominates(&MBB, Succ)) {
7225 MDT->changeImmediateDominator(Succ, RemainderBB);
7226 }
7227 }
7228 }
7229
7230 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7231
7232 MachineBasicBlock::iterator First = RemainderBB->begin();
7233 // Restore SCC
7234 if (SCCNotDead) {
7235 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7236 .addReg(SaveSCCReg, RegState::Kill)
7237 .addImm(0);
7238 }
7239
7240 // Restore the EXEC mask
7241 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7242 .addReg(SaveExec);
7243 return BodyBB;
7244}
7245
7246// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
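// Schematically, the code below returns the original 64-bit base pointer
// (RsrcPtr) and builds a zero-based replacement descriptor:
//   NewSRsrc = { 0, 0, RSRC_DATA_FORMAT[31:0], RSRC_DATA_FORMAT[63:32] }
// so the caller can fold the pointer into the vaddr computation instead.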
7247static std::tuple<unsigned, unsigned>
7248 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
7249 MachineBasicBlock &MBB = *MI.getParent();
7250 MachineFunction &MF = *MBB.getParent();
7251 MachineRegisterInfo &MRI = MF.getRegInfo();
7252
7253 // Extract the ptr from the resource descriptor.
7254 unsigned RsrcPtr =
7255 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7256 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7257
7258 // Create an empty resource descriptor
7259 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7260 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7261 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7262 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7263 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7264
7265 // Zero64 = 0
7266 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7267 .addImm(0);
7268
7269 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7270 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7271 .addImm(Lo_32(RsrcDataFormat));
7272
7273 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7274 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7275 .addImm(Hi_32(RsrcDataFormat));
7276
7277 // NewSRsrc = {Zero64, SRsrcFormat}
7278 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7279 .addReg(Zero64)
7280 .addImm(AMDGPU::sub0_sub1)
7281 .addReg(SRsrcFormatLo)
7282 .addImm(AMDGPU::sub2)
7283 .addReg(SRsrcFormatHi)
7284 .addImm(AMDGPU::sub3);
7285
7286 return std::tuple(RsrcPtr, NewSRsrc);
7287}
7288
7289 MachineBasicBlock *
7290 SIInstrInfo::legalizeOperands(MachineInstr &MI,
7291 MachineDominatorTree *MDT) const {
7292 MachineFunction &MF = *MI.getMF();
7293 MachineRegisterInfo &MRI = MF.getRegInfo();
7294 MachineBasicBlock *CreatedBB = nullptr;
7295
7296 // Legalize VOP2
7297 if (isVOP2(MI) || isVOPC(MI)) {
7298 legalizeOperandsVOP2(MRI, MI);
7299 return CreatedBB;
7300 }
7301
7302 // Legalize VOP3
7303 if (isVOP3(MI)) {
7304 legalizeOperandsVOP3(MRI, MI);
7305 return CreatedBB;
7306 }
7307
7308 // Legalize SMRD
7309 if (isSMRD(MI)) {
7310 legalizeOperandsSMRD(MRI, MI);
7311 return CreatedBB;
7312 }
7313
7314 // Legalize FLAT
7315 if (isFLAT(MI)) {
7316 legalizeOperandsFLAT(MRI, MI);
7317 return CreatedBB;
7318 }
7319
7320 // Legalize REG_SEQUENCE and PHI
7321 // The register class of the operands must be the same type as the register
7322 // class of the output.
7323 if (MI.getOpcode() == AMDGPU::PHI) {
7324 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7325 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7326 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7327 continue;
7328 const TargetRegisterClass *OpRC =
7329 MRI.getRegClass(MI.getOperand(i).getReg());
7330 if (RI.hasVectorRegisters(OpRC)) {
7331 VRC = OpRC;
7332 } else {
7333 SRC = OpRC;
7334 }
7335 }
7336
7337 // If any of the operands are VGPR registers, then they all must be VGPRs;
7338 // otherwise we will create illegal VGPR->SGPR copies when legalizing
7339 // them.
7340 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7341 if (!VRC) {
7342 assert(SRC);
7343 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7344 VRC = &AMDGPU::VReg_1RegClass;
7345 } else
7346 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7347 ? RI.getEquivalentAGPRClass(SRC)
7348 : RI.getEquivalentVGPRClass(SRC);
7349 } else {
7350 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7351 ? RI.getEquivalentAGPRClass(VRC)
7352 : RI.getEquivalentVGPRClass(VRC);
7353 }
7354 RC = VRC;
7355 } else {
7356 RC = SRC;
7357 }
7358
7359 // Update all the operands so they have the same type.
7360 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7361 MachineOperand &Op = MI.getOperand(I);
7362 if (!Op.isReg() || !Op.getReg().isVirtual())
7363 continue;
7364
7365 // MI is a PHI instruction.
7366 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7367 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7368
7369 // Avoid creating no-op copies with the same src and dst reg class. These
7370 // confuse some of the machine passes.
7371 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7372 }
7373 }
7374
7375 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7376 // VGPR dest type and SGPR sources, insert copies so all operands are
7377 // VGPRs. This seems to help operand folding / the register coalescer.
7378 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7379 MachineBasicBlock *MBB = MI.getParent();
7380 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7381 if (RI.hasVGPRs(DstRC)) {
7382 // Update all the operands so they are VGPR register classes. These may
7383 // not be the same register class because REG_SEQUENCE supports mixing
7384 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7385 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7386 MachineOperand &Op = MI.getOperand(I);
7387 if (!Op.isReg() || !Op.getReg().isVirtual())
7388 continue;
7389
7390 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7391 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7392 if (VRC == OpRC)
7393 continue;
7394
7395 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7396 Op.setIsKill();
7397 }
7398 }
7399
7400 return CreatedBB;
7401 }
7402
7403 // Legalize INSERT_SUBREG
7404 // src0 must have the same register class as dst
7405 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7406 Register Dst = MI.getOperand(0).getReg();
7407 Register Src0 = MI.getOperand(1).getReg();
7408 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7409 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7410 if (DstRC != Src0RC) {
7411 MachineBasicBlock *MBB = MI.getParent();
7412 MachineOperand &Op = MI.getOperand(1);
7413 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7414 }
7415 return CreatedBB;
7416 }
7417
7418 // Legalize SI_INIT_M0
7419 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7420 MachineOperand &Src = MI.getOperand(0);
7421 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7422 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7423 return CreatedBB;
7424 }
7425
7426 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7427 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7428 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7429 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7430 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7431 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7432 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7433 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7434 MachineOperand &Src = MI.getOperand(1);
7435 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7436 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7437 return CreatedBB;
7438 }
7439
7440 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7441 //
7442 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7443 // scratch memory access. In both cases, the legalization never involves
7444 // conversion to the addr64 form.
7445 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
7446 (isMUBUF(MI) || isMTBUF(MI)))) {
7447 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7448 ? AMDGPU::OpName::rsrc
7449 : AMDGPU::OpName::srsrc;
7450 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7451 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7452 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7453
7454 AMDGPU::OpName SampOpName =
7455 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7456 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7457 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7458 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7459
7460 return CreatedBB;
7461 }
7462
7463 // Legalize SI_CALL
7464 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7465 MachineOperand *Dest = &MI.getOperand(0);
7466 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7467 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, plus the
7468 // following copies, into the loop block; we also need to move copies from
7469 // and to physical registers into the loop block.
7470 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7471 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7472
7473 // Also move the copies to physical registers into the loop block
7474 MachineBasicBlock &MBB = *MI.getParent();
7475 MachineBasicBlock::iterator Start(&MI);
7476 while (Start->getOpcode() != FrameSetupOpcode)
7477 --Start;
7478 MachineBasicBlock::iterator End(&MI);
7479 while (End->getOpcode() != FrameDestroyOpcode)
7480 ++End;
7481 // Also include following copies of the return value
7482 ++End;
7483 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7484 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7485 ++End;
7486 CreatedBB =
7487 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7488 }
7489 }
7490
7491 // Legalize s_sleep_var.
7492 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7493 const DebugLoc &DL = MI.getDebugLoc();
7494 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7495 int Src0Idx =
7496 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7497 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7498 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7499 .add(Src0);
7500 Src0.ChangeToRegister(Reg, false);
7501 return nullptr;
7502 }
7503
7504 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7505 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7506 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7507 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7508 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7509 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7510 for (MachineOperand &Src : MI.explicit_operands()) {
7511 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7512 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7513 }
7514 return CreatedBB;
7515 }
7516
7517 // Legalize MUBUF instructions.
7518 bool isSoffsetLegal = true;
7519 int SoffsetIdx =
7520 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7521 if (SoffsetIdx != -1) {
7522 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7523 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7524 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7525 isSoffsetLegal = false;
7526 }
7527 }
7528
7529 bool isRsrcLegal = true;
7530 int RsrcIdx =
7531 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7532 if (RsrcIdx != -1) {
7533 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7534 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7535 isRsrcLegal = false;
7536 }
7537
7538 // The operands are legal.
7539 if (isRsrcLegal && isSoffsetLegal)
7540 return CreatedBB;
7541
7542 if (!isRsrcLegal) {
7543 // Legalize a VGPR Rsrc
7544 //
7545 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7546 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7547 // a zero-value SRsrc.
7548 //
7549 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7550 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7551 // above.
7552 //
7553 // Otherwise we are on non-ADDR64 hardware, and/or we have
7554 // idxen/offen/bothen and we fall back to a waterfall loop.
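 // Schematically, for the two ADDR64 paths handled below:
 //   NewVAddr = RsrcPtr + VAddr   (just RsrcPtr for the converted _OFFSET form)
 //   NewSRsrc = { 0, RSRC_DATA_FORMAT }
 // i.e. the base address moves into the 64-bit vaddr while the descriptor
 // itself becomes zero-based.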
7555
7556 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7557 MachineBasicBlock &MBB = *MI.getParent();
7558
7559 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7560 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7561 // This is already an ADDR64 instruction so we need to add the pointer
7562 // extracted from the resource descriptor to the current value of VAddr.
7563 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7564 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7565 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7566
7567 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7568 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7569 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7570
7571 unsigned RsrcPtr, NewSRsrc;
7572 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7573
7574 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7575 const DebugLoc &DL = MI.getDebugLoc();
7576 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7577 .addDef(CondReg0)
7578 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7579 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7580 .addImm(0);
7581
7582 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7583 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7584 .addDef(CondReg1, RegState::Dead)
7585 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7586 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7587 .addReg(CondReg0, RegState::Kill)
7588 .addImm(0);
7589
7590 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7591 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7592 .addReg(NewVAddrLo)
7593 .addImm(AMDGPU::sub0)
7594 .addReg(NewVAddrHi)
7595 .addImm(AMDGPU::sub1);
7596
7597 VAddr->setReg(NewVAddr);
7598 Rsrc->setReg(NewSRsrc);
7599 } else if (!VAddr && ST.hasAddr64()) {
7600 // This instruction is the _OFFSET variant, so we need to convert it to
7601 // ADDR64.
7602 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7603 "FIXME: Need to emit flat atomics here");
7604
7605 unsigned RsrcPtr, NewSRsrc;
7606 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7607
7608 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7609 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7610 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7611 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7612 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7613
7614 // Atomics with return have an additional tied operand and are
7615 // missing some of the special bits.
7616 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7617 MachineInstr *Addr64;
7618
7619 if (!VDataIn) {
7620 // Regular buffer load / store.
7621 MachineInstrBuilder MIB =
7622 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7623 .add(*VData)
7624 .addReg(NewVAddr)
7625 .addReg(NewSRsrc)
7626 .add(*SOffset)
7627 .add(*Offset);
7628
7629 if (const MachineOperand *CPol =
7630 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7631 MIB.addImm(CPol->getImm());
7632 }
7633
7634 if (const MachineOperand *TFE =
7635 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7636 MIB.addImm(TFE->getImm());
7637 }
7638
7639 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7640
7641 MIB.cloneMemRefs(MI);
7642 Addr64 = MIB;
7643 } else {
7644 // Atomics with return.
7645 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7646 .add(*VData)
7647 .add(*VDataIn)
7648 .addReg(NewVAddr)
7649 .addReg(NewSRsrc)
7650 .add(*SOffset)
7651 .add(*Offset)
7652 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7653 .cloneMemRefs(MI);
7654 }
7655
7656 MI.removeFromParent();
7657
7658 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7659 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7660 NewVAddr)
7661 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7662 .addImm(AMDGPU::sub0)
7663 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7664 .addImm(AMDGPU::sub1);
7665 } else {
7666 // Legalize a VGPR Rsrc and soffset together.
7667 if (!isSoffsetLegal) {
7668 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7669 CreatedBB =
7670 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7671 return CreatedBB;
7672 }
7673 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7674 return CreatedBB;
7675 }
7676 }
7677
7678 // Legalize a VGPR soffset.
7679 if (!isSoffsetLegal) {
7680 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7681 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7682 return CreatedBB;
7683 }
7684 return CreatedBB;
7685}
7686
7687 void SIInstrWorklist::insert(MachineInstr *MI) {
7688 InstrList.insert(MI);
7689 // Add MBUF instructions to the deferred list.
7690 int RsrcIdx =
7691 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7692 if (RsrcIdx != -1) {
7693 DeferredList.insert(MI);
7694 }
7695}
7696
7697 bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7698 return DeferredList.contains(MI);
7699}
7700
7701 // Legalize size mismatches between 16bit and 32bit registers in v2s copy
7702 // lowering (change sgpr to vgpr).
7703 // This is mainly caused by 16bit SALU and 16bit VALU using registers with
7704 // different sizes. Need to legalize the size of the operands during the
7705 // vgpr lowering chain. This can be removed after we have sgpr16 in place.
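// For example (illustrative), a 16-bit VGPR value feeding an operand that
// expects a 32-bit VGPR is widened below with
//   %w:vgpr_32 = REG_SEQUENCE %v:vgpr_16, lo16, %undef:vgpr_16, hi16
// while in the opposite direction the use is simply narrowed to the lo16
// subregister.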
7706 void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7707 MachineRegisterInfo &MRI) const {
7708 if (!ST.useRealTrue16Insts())
7709 return;
7710
7711 unsigned Opcode = MI.getOpcode();
7712 MachineBasicBlock *MBB = MI.getParent();
7713 // Legalize operands and check for size mismatch
7714 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7715 OpIdx >= get(Opcode).getNumOperands() ||
7716 get(Opcode).operands()[OpIdx].RegClass == -1)
7717 return;
7718
7719 MachineOperand &Op = MI.getOperand(OpIdx);
7720 if (!Op.isReg() || !Op.getReg().isVirtual())
7721 return;
7722
7723 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7724 if (!RI.isVGPRClass(CurrRC))
7725 return;
7726
7727 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7728 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7729 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7730 Op.setSubReg(AMDGPU::lo16);
7731 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7732 const DebugLoc &DL = MI.getDebugLoc();
7733 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7734 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7735 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7736 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7737 .addReg(Op.getReg())
7738 .addImm(AMDGPU::lo16)
7739 .addReg(Undef)
7740 .addImm(AMDGPU::hi16);
7741 Op.setReg(NewDstReg);
7742 }
7743}
7744 void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7745 MachineRegisterInfo &MRI) const {
7746 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7748}
7749
7750 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7751 MachineDominatorTree *MDT) const {
7752
7753 while (!Worklist.empty()) {
7754 MachineInstr &Inst = *Worklist.top();
7755 Worklist.erase_top();
7756 // Skip MachineInstr in the deferred list.
7757 if (Worklist.isDeferred(&Inst))
7758 continue;
7759 moveToVALUImpl(Worklist, MDT, Inst);
7760 }
7761
7762 // Deferred list of instructions will be processed once
7763 // all the MachineInstr in the worklist are done.
7764 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7765 moveToVALUImpl(Worklist, MDT, *Inst);
7766 assert(Worklist.empty() &&
7767 "Deferred MachineInstr are not supposed to re-populate worklist");
7768 }
7769}
7770
7771 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7772 MachineDominatorTree *MDT,
7774
7776 if (!MBB)
7777 return;
7778 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7779 unsigned Opcode = Inst.getOpcode();
7780 unsigned NewOpcode = getVALUOp(Inst);
7781 const DebugLoc &DL = Inst.getDebugLoc();
7782
7783 // Handle some special cases
7784 switch (Opcode) {
7785 default:
7786 break;
7787 case AMDGPU::S_ADD_I32:
7788 case AMDGPU::S_SUB_I32: {
7789 // FIXME: The u32 versions currently selected use the carry.
7790 bool Changed;
7791 MachineBasicBlock *CreatedBBTmp = nullptr;
7792 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7793 if (Changed)
7794 return;
7795
7796 // Default handling
7797 break;
7798 }
7799
7800 case AMDGPU::S_MUL_U64:
7801 if (ST.hasVectorMulU64()) {
7802 NewOpcode = AMDGPU::V_MUL_U64_e64;
7803 break;
7804 }
7805 // Split s_mul_u64 in 32-bit vector multiplications.
7806 splitScalarSMulU64(Worklist, Inst, MDT);
7807 Inst.eraseFromParent();
7808 return;
7809
7810 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7811 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7812 // This is a special case of s_mul_u64 where all the operands are either
7813 // zero extended or sign extended.
7814 splitScalarSMulPseudo(Worklist, Inst, MDT);
7815 Inst.eraseFromParent();
7816 return;
7817
7818 case AMDGPU::S_AND_B64:
7819 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7820 Inst.eraseFromParent();
7821 return;
7822
7823 case AMDGPU::S_OR_B64:
7824 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7825 Inst.eraseFromParent();
7826 return;
7827
7828 case AMDGPU::S_XOR_B64:
7829 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7830 Inst.eraseFromParent();
7831 return;
7832
7833 case AMDGPU::S_NAND_B64:
7834 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7835 Inst.eraseFromParent();
7836 return;
7837
7838 case AMDGPU::S_NOR_B64:
7839 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7840 Inst.eraseFromParent();
7841 return;
7842
7843 case AMDGPU::S_XNOR_B64:
7844 if (ST.hasDLInsts())
7845 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7846 else
7847 splitScalar64BitXnor(Worklist, Inst, MDT);
7848 Inst.eraseFromParent();
7849 return;
7850
7851 case AMDGPU::S_ANDN2_B64:
7852 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7853 Inst.eraseFromParent();
7854 return;
7855
7856 case AMDGPU::S_ORN2_B64:
7857 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7858 Inst.eraseFromParent();
7859 return;
7860
7861 case AMDGPU::S_BREV_B64:
7862 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7863 Inst.eraseFromParent();
7864 return;
7865
7866 case AMDGPU::S_NOT_B64:
7867 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7868 Inst.eraseFromParent();
7869 return;
7870
7871 case AMDGPU::S_BCNT1_I32_B64:
7872 splitScalar64BitBCNT(Worklist, Inst);
7873 Inst.eraseFromParent();
7874 return;
7875
7876 case AMDGPU::S_BFE_I64:
7877 splitScalar64BitBFE(Worklist, Inst);
7878 Inst.eraseFromParent();
7879 return;
7880
7881 case AMDGPU::S_FLBIT_I32_B64:
7882 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7883 Inst.eraseFromParent();
7884 return;
7885 case AMDGPU::S_FF1_I32_B64:
7886 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7887 Inst.eraseFromParent();
7888 return;
7889
7890 case AMDGPU::S_LSHL_B32:
7891 if (ST.hasOnlyRevVALUShifts()) {
7892 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7893 swapOperands(Inst);
7894 }
7895 break;
7896 case AMDGPU::S_ASHR_I32:
7897 if (ST.hasOnlyRevVALUShifts()) {
7898 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7899 swapOperands(Inst);
7900 }
7901 break;
7902 case AMDGPU::S_LSHR_B32:
7903 if (ST.hasOnlyRevVALUShifts()) {
7904 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7905 swapOperands(Inst);
7906 }
7907 break;
7908 case AMDGPU::S_LSHL_B64:
7909 if (ST.hasOnlyRevVALUShifts()) {
7910 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7911 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7912 : AMDGPU::V_LSHLREV_B64_e64;
7913 swapOperands(Inst);
7914 }
7915 break;
7916 case AMDGPU::S_ASHR_I64:
7917 if (ST.hasOnlyRevVALUShifts()) {
7918 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7919 swapOperands(Inst);
7920 }
7921 break;
7922 case AMDGPU::S_LSHR_B64:
7923 if (ST.hasOnlyRevVALUShifts()) {
7924 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7925 swapOperands(Inst);
7926 }
7927 break;
7928
7929 case AMDGPU::S_ABS_I32:
7930 lowerScalarAbs(Worklist, Inst);
7931 Inst.eraseFromParent();
7932 return;
7933
7934 case AMDGPU::S_ABSDIFF_I32:
7935 lowerScalarAbsDiff(Worklist, Inst);
7936 Inst.eraseFromParent();
7937 return;
7938
7939 case AMDGPU::S_CBRANCH_SCC0:
7940 case AMDGPU::S_CBRANCH_SCC1: {
7941 // Clear unused bits of vcc
7942 Register CondReg = Inst.getOperand(1).getReg();
7943 bool IsSCC = CondReg == AMDGPU::SCC;
7945 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
7946 .addReg(LMC.ExecReg)
7947 .addReg(IsSCC ? LMC.VccReg : CondReg);
7948 Inst.removeOperand(1);
7949 } break;
7950
7951 case AMDGPU::S_BFE_U64:
7952 case AMDGPU::S_BFM_B64:
7953 llvm_unreachable("Moving this op to VALU not implemented");
7954
7955 case AMDGPU::S_PACK_LL_B32_B16:
7956 case AMDGPU::S_PACK_LH_B32_B16:
7957 case AMDGPU::S_PACK_HL_B32_B16:
7958 case AMDGPU::S_PACK_HH_B32_B16:
7959 movePackToVALU(Worklist, MRI, Inst);
7960 Inst.eraseFromParent();
7961 return;
7962
7963 case AMDGPU::S_XNOR_B32:
7964 lowerScalarXnor(Worklist, Inst);
7965 Inst.eraseFromParent();
7966 return;
7967
7968 case AMDGPU::S_NAND_B32:
7969 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7970 Inst.eraseFromParent();
7971 return;
7972
7973 case AMDGPU::S_NOR_B32:
7974 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7975 Inst.eraseFromParent();
7976 return;
7977
7978 case AMDGPU::S_ANDN2_B32:
7979 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7980 Inst.eraseFromParent();
7981 return;
7982
7983 case AMDGPU::S_ORN2_B32:
7984 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7985 Inst.eraseFromParent();
7986 return;
7987
7988 // TODO: remove as soon as everything is ready
7989 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7990 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7991 // can only be selected from the uniform SDNode.
7992 case AMDGPU::S_ADD_CO_PSEUDO:
7993 case AMDGPU::S_SUB_CO_PSEUDO: {
7994 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7995 ? AMDGPU::V_ADDC_U32_e64
7996 : AMDGPU::V_SUBB_U32_e64;
7997 const auto *CarryRC = RI.getWaveMaskRegClass();
7998
7999 Register CarryInReg = Inst.getOperand(4).getReg();
8000 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
8001 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
8002 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
8003 .addReg(CarryInReg);
8004 }
8005
8006 Register CarryOutReg = Inst.getOperand(1).getReg();
8007
8008 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
8009 MRI.getRegClass(Inst.getOperand(0).getReg())));
8010 MachineInstr *CarryOp =
8011 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
8012 .addReg(CarryOutReg, RegState::Define)
8013 .add(Inst.getOperand(2))
8014 .add(Inst.getOperand(3))
8015 .addReg(CarryInReg)
8016 .addImm(0);
8017 legalizeOperands(*CarryOp);
8018 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
8019 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8020 Inst.eraseFromParent();
8021 }
8022 return;
8023 case AMDGPU::S_UADDO_PSEUDO:
8024 case AMDGPU::S_USUBO_PSEUDO: {
8025 MachineOperand &Dest0 = Inst.getOperand(0);
8026 MachineOperand &Dest1 = Inst.getOperand(1);
8027 MachineOperand &Src0 = Inst.getOperand(2);
8028 MachineOperand &Src1 = Inst.getOperand(3);
8029
8030 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8031 ? AMDGPU::V_ADD_CO_U32_e64
8032 : AMDGPU::V_SUB_CO_U32_e64;
8033 const TargetRegisterClass *NewRC =
8034 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
8035 Register DestReg = MRI.createVirtualRegister(NewRC);
8036 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
8037 .addReg(Dest1.getReg(), RegState::Define)
8038 .add(Src0)
8039 .add(Src1)
8040 .addImm(0); // clamp bit
8041
8042 legalizeOperands(*NewInstr, MDT);
8043 MRI.replaceRegWith(Dest0.getReg(), DestReg);
8044 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8045 Inst.eraseFromParent();
8046 }
8047 return;
8048 case AMDGPU::S_LSHL1_ADD_U32:
8049 case AMDGPU::S_LSHL2_ADD_U32:
8050 case AMDGPU::S_LSHL3_ADD_U32:
8051 case AMDGPU::S_LSHL4_ADD_U32: {
8052 MachineOperand &Dest = Inst.getOperand(0);
8053 MachineOperand &Src0 = Inst.getOperand(1);
8054 MachineOperand &Src1 = Inst.getOperand(2);
8055 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8056 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8057 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8058 : 4);
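 // For example (illustrative), S_LSHL2_ADD_U32 %d, %a, %b becomes
 // V_LSHL_ADD_U32_e64 %d, %a, 2, %b below.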
8059
8060 const TargetRegisterClass *NewRC =
8061 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
8062 Register DestReg = MRI.createVirtualRegister(NewRC);
8063 MachineInstr *NewInstr =
8064 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8065 .add(Src0)
8066 .addImm(ShiftAmt)
8067 .add(Src1);
8068
8069 legalizeOperands(*NewInstr, MDT);
8070 MRI.replaceRegWith(Dest.getReg(), DestReg);
8071 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8072 Inst.eraseFromParent();
8073 }
8074 return;
8075 case AMDGPU::S_CSELECT_B32:
8076 case AMDGPU::S_CSELECT_B64:
8077 lowerSelect(Worklist, Inst, MDT);
8078 Inst.eraseFromParent();
8079 return;
8080 case AMDGPU::S_CMP_EQ_I32:
8081 case AMDGPU::S_CMP_LG_I32:
8082 case AMDGPU::S_CMP_GT_I32:
8083 case AMDGPU::S_CMP_GE_I32:
8084 case AMDGPU::S_CMP_LT_I32:
8085 case AMDGPU::S_CMP_LE_I32:
8086 case AMDGPU::S_CMP_EQ_U32:
8087 case AMDGPU::S_CMP_LG_U32:
8088 case AMDGPU::S_CMP_GT_U32:
8089 case AMDGPU::S_CMP_GE_U32:
8090 case AMDGPU::S_CMP_LT_U32:
8091 case AMDGPU::S_CMP_LE_U32:
8092 case AMDGPU::S_CMP_EQ_U64:
8093 case AMDGPU::S_CMP_LG_U64:
8094 case AMDGPU::S_CMP_LT_F32:
8095 case AMDGPU::S_CMP_EQ_F32:
8096 case AMDGPU::S_CMP_LE_F32:
8097 case AMDGPU::S_CMP_GT_F32:
8098 case AMDGPU::S_CMP_LG_F32:
8099 case AMDGPU::S_CMP_GE_F32:
8100 case AMDGPU::S_CMP_O_F32:
8101 case AMDGPU::S_CMP_U_F32:
8102 case AMDGPU::S_CMP_NGE_F32:
8103 case AMDGPU::S_CMP_NLG_F32:
8104 case AMDGPU::S_CMP_NGT_F32:
8105 case AMDGPU::S_CMP_NLE_F32:
8106 case AMDGPU::S_CMP_NEQ_F32:
8107 case AMDGPU::S_CMP_NLT_F32: {
8108 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8109 auto NewInstr =
8110 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8111 .setMIFlags(Inst.getFlags());
8112 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
8113 0) {
8114 NewInstr
8115 .addImm(0) // src0_modifiers
8116 .add(Inst.getOperand(0)) // src0
8117 .addImm(0) // src1_modifiers
8118 .add(Inst.getOperand(1)) // src1
8119 .addImm(0); // clamp
8120 } else {
8121 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
8122 }
8123 legalizeOperands(*NewInstr, MDT);
8124 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8125 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8126 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8127 Inst.eraseFromParent();
8128 return;
8129 }
8130 case AMDGPU::S_CMP_LT_F16:
8131 case AMDGPU::S_CMP_EQ_F16:
8132 case AMDGPU::S_CMP_LE_F16:
8133 case AMDGPU::S_CMP_GT_F16:
8134 case AMDGPU::S_CMP_LG_F16:
8135 case AMDGPU::S_CMP_GE_F16:
8136 case AMDGPU::S_CMP_O_F16:
8137 case AMDGPU::S_CMP_U_F16:
8138 case AMDGPU::S_CMP_NGE_F16:
8139 case AMDGPU::S_CMP_NLG_F16:
8140 case AMDGPU::S_CMP_NGT_F16:
8141 case AMDGPU::S_CMP_NLE_F16:
8142 case AMDGPU::S_CMP_NEQ_F16:
8143 case AMDGPU::S_CMP_NLT_F16: {
8144 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8145 auto NewInstr =
8146 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8147 .setMIFlags(Inst.getFlags());
8148 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8149 NewInstr
8150 .addImm(0) // src0_modifiers
8151 .add(Inst.getOperand(0)) // src0
8152 .addImm(0) // src1_modifiers
8153 .add(Inst.getOperand(1)) // src1
8154 .addImm(0); // clamp
8155 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8156 NewInstr.addImm(0); // op_sel0
8157 } else {
8158 NewInstr
8159 .add(Inst.getOperand(0))
8160 .add(Inst.getOperand(1));
8161 }
8162 legalizeOperandsVALUt16(*NewInstr, MRI);
8163 legalizeOperands(*NewInstr, MDT);
8164 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8165 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8166 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8167 Inst.eraseFromParent();
8168 return;
8169 }
8170 case AMDGPU::S_CVT_HI_F32_F16: {
8171 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8172 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8173 if (ST.useRealTrue16Insts()) {
8174 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8175 .add(Inst.getOperand(1));
8176 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8177 .addImm(0) // src0_modifiers
8178 .addReg(TmpReg, 0, AMDGPU::hi16)
8179 .addImm(0) // clamp
8180 .addImm(0) // omod
8181 .addImm(0); // op_sel0
8182 } else {
8183 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8184 .addImm(16)
8185 .add(Inst.getOperand(1));
8186 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8187 .addImm(0) // src0_modifiers
8188 .addReg(TmpReg)
8189 .addImm(0) // clamp
8190 .addImm(0); // omod
8191 }
8192
8193 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8194 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8195 Inst.eraseFromParent();
8196 return;
8197 }
8198 case AMDGPU::S_MINIMUM_F32:
8199 case AMDGPU::S_MAXIMUM_F32: {
8200 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8201 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8202 .addImm(0) // src0_modifiers
8203 .add(Inst.getOperand(1))
8204 .addImm(0) // src1_modifiers
8205 .add(Inst.getOperand(2))
8206 .addImm(0) // clamp
8207 .addImm(0); // omod
8208 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8209
8210 legalizeOperands(*NewInstr, MDT);
8211 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8212 Inst.eraseFromParent();
8213 return;
8214 }
8215 case AMDGPU::S_MINIMUM_F16:
8216 case AMDGPU::S_MAXIMUM_F16: {
8217 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8218 ? &AMDGPU::VGPR_16RegClass
8219 : &AMDGPU::VGPR_32RegClass);
8220 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8221 .addImm(0) // src0_modifiers
8222 .add(Inst.getOperand(1))
8223 .addImm(0) // src1_modifiers
8224 .add(Inst.getOperand(2))
8225 .addImm(0) // clamp
8226 .addImm(0) // omod
8227 .addImm(0); // opsel0
8228 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8229 legalizeOperandsVALUt16(*NewInstr, MRI);
8230 legalizeOperands(*NewInstr, MDT);
8231 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8232 Inst.eraseFromParent();
8233 return;
8234 }
8235 case AMDGPU::V_S_EXP_F16_e64:
8236 case AMDGPU::V_S_LOG_F16_e64:
8237 case AMDGPU::V_S_RCP_F16_e64:
8238 case AMDGPU::V_S_RSQ_F16_e64:
8239 case AMDGPU::V_S_SQRT_F16_e64: {
8240 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8241 ? &AMDGPU::VGPR_16RegClass
8242 : &AMDGPU::VGPR_32RegClass);
8243 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8244 .add(Inst.getOperand(1)) // src0_modifiers
8245 .add(Inst.getOperand(2))
8246 .add(Inst.getOperand(3)) // clamp
8247 .add(Inst.getOperand(4)) // omod
8248 .setMIFlags(Inst.getFlags());
8249 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8250 NewInstr.addImm(0); // opsel0
8251 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8252 legalizeOperandsVALUt16(*NewInstr, MRI);
8253 legalizeOperands(*NewInstr, MDT);
8254 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8255 Inst.eraseFromParent();
8256 return;
8257 }
8258 }
8259
8260 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8261 // We cannot move this instruction to the VALU, so we should try to
8262 // legalize its operands instead.
8263 legalizeOperands(Inst, MDT);
8264 return;
8265 }
8266 // Handle converting generic instructions like COPY-to-SGPR into
8267 // COPY-to-VGPR.
8268 if (NewOpcode == Opcode) {
8269 Register DstReg = Inst.getOperand(0).getReg();
8270 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8271
8272 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8273 // hope for the best.
8274 if (Inst.isCopy() && DstReg.isPhysical() &&
8275 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8276 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8277 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8278 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8279 .add(Inst.getOperand(1));
8280 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8281 DstReg)
8282 .addReg(NewDst);
8283
8284 Inst.eraseFromParent();
8285 return;
8286 }
8287
8288 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
8289 Register NewDstReg = Inst.getOperand(1).getReg();
8290 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8291 if (const TargetRegisterClass *CommonRC =
8292 RI.getCommonSubClass(NewDstRC, SrcRC)) {
8293 // Instead of creating a copy where src and dst are the same register
8294 // class, we just replace all uses of dst with src. These kinds of
8295 // copies interfere with the heuristics MachineSink uses to decide
8296 // whether or not to split a critical edge, since the pass assumes
8297 // that copies will end up as machine instructions and not be
8298 // eliminated.
8299 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8300 MRI.replaceRegWith(DstReg, NewDstReg);
8301 MRI.clearKillFlags(NewDstReg);
8302 Inst.getOperand(0).setReg(DstReg);
8303
8304 if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8305 llvm_unreachable("failed to constrain register");
8306
8307 Inst.eraseFromParent();
8308 // Legalize t16 operand since replaceReg is called after addUsersToVALU
8309 for (MachineOperand &MO :
8310 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8311 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8312 }
8313
8314 return;
8315 }
8316 }
8317
8318 // If this is a v2s copy between a 16bit and a 32bit reg,
8319 // replace the vgpr copy with a reg_sequence/extract_subreg.
8320 // This can be removed after we have sgpr16 in place.
8321 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8322 Inst.getOperand(1).getReg().isVirtual() &&
8323 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8324 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8325 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8326 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8327 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8328 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8329 get(AMDGPU::IMPLICIT_DEF), Undef);
8330 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8331 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8332 .addReg(Inst.getOperand(1).getReg())
8333 .addImm(AMDGPU::lo16)
8334 .addReg(Undef)
8335 .addImm(AMDGPU::hi16);
8336 Inst.eraseFromParent();
8337 MRI.replaceRegWith(DstReg, NewDstReg);
8338 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8339 return;
8340 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8341 AMDGPU::lo16)) {
8342 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8343 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8344 MRI.replaceRegWith(DstReg, NewDstReg);
8345 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8346 return;
8347 }
8348 }
8349
8350 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8351 MRI.replaceRegWith(DstReg, NewDstReg);
8352 legalizeOperands(Inst, MDT);
8353 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8354 return;
8355 }
8356
8357 // Use the new VALU Opcode.
8358 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8359 .setMIFlags(Inst.getFlags());
8360 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8361 // Intersperse VOP3 modifiers among the SALU operands.
8362 NewInstr->addOperand(Inst.getOperand(0));
8363 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8364 AMDGPU::OpName::src0_modifiers) >= 0)
8365 NewInstr.addImm(0);
8366 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8367 const MachineOperand &Src = Inst.getOperand(1);
8368 NewInstr->addOperand(Src);
8369 }
8370
8371 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8372 // We are converting these to a BFE, so we need to add the missing
8373 // operands for the size and offset.
8374 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8375 NewInstr.addImm(0);
8376 NewInstr.addImm(Size);
8377 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8378 // The VALU version adds the second operand to the result, so insert an
8379 // extra 0 operand.
8380 NewInstr.addImm(0);
8381 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8382 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8383 // If we need to move this to VGPRs, we need to unpack the second
8384 // operand back into the 2 separate ones for bit offset and width.
8385 assert(OffsetWidthOp.isImm() &&
8386 "Scalar BFE is only implemented for constant width and offset");
8387 uint32_t Imm = OffsetWidthOp.getImm();
8388
8389 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8390 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
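 // For example (illustrative), an encoded operand of 0x100010 yields
 // Offset = 16 and BitWidth = 16.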
8391 NewInstr.addImm(Offset);
8392 NewInstr.addImm(BitWidth);
8393 } else {
8394 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8395 AMDGPU::OpName::src1_modifiers) >= 0)
8396 NewInstr.addImm(0);
8397 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8398 NewInstr->addOperand(Inst.getOperand(2));
8399 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8400 AMDGPU::OpName::src2_modifiers) >= 0)
8401 NewInstr.addImm(0);
8402 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8403 NewInstr->addOperand(Inst.getOperand(3));
8404 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8405 NewInstr.addImm(0);
8406 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8407 NewInstr.addImm(0);
8408 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8409 NewInstr.addImm(0);
8410 }
8411 } else {
8412 // Just copy the SALU operands.
8413 for (const MachineOperand &Op : Inst.explicit_operands())
8414 NewInstr->addOperand(Op);
8415 }
8416
8417 // Remove any references to SCC. Vector instructions can't read from it, and
8418 // we're just about to add the implicit use / defs of VCC, and we don't want
8419 // both.
8420 for (MachineOperand &Op : Inst.implicit_operands()) {
8421 if (Op.getReg() == AMDGPU::SCC) {
8422 // Only propagate through live-def of SCC.
8423 if (Op.isDef() && !Op.isDead())
8424 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8425 if (Op.isUse())
8426 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8427 }
8428 }
8429 Inst.eraseFromParent();
8430 Register NewDstReg;
8431 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8432 Register DstReg = NewInstr->getOperand(0).getReg();
8433 assert(DstReg.isVirtual());
8434 // Update the destination register class.
8435 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8436 assert(NewDstRC);
8437 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8438 MRI.replaceRegWith(DstReg, NewDstReg);
8439 }
8440 fixImplicitOperands(*NewInstr);
8441
8442 legalizeOperandsVALUt16(*NewInstr, MRI);
8443
8444 // Legalize the operands
8445 legalizeOperands(*NewInstr, MDT);
8446 if (NewDstReg)
8447 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8448}
8449
8450// Add/sub require special handling to deal with carry outs.
8451std::pair<bool, MachineBasicBlock *>
8452SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8453 MachineDominatorTree *MDT) const {
8454 if (ST.hasAddNoCarry()) {
8455 // Assume there is no user of scc since we don't select this in that case.
8456 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8457 // is used.
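// Illustrative example (not from the original source), assuming the no-carry
// variants are available: the in-place rewrite below turns
//   %2:sgpr_32 = S_ADD_I32 %0, %1, implicit-def dead $scc
// into
//   %2:vgpr_32 = V_ADD_U32_e64 %0, %1, 0   (trailing 0 is the clamp bit)
// before the operands are legalized and the users are re-queued.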
8458
8459 MachineBasicBlock &MBB = *Inst.getParent();
8460 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8461
8462 Register OldDstReg = Inst.getOperand(0).getReg();
8463 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8464
8465 unsigned Opc = Inst.getOpcode();
8466 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8467
8468 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8469 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8470
8471 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8472 Inst.removeOperand(3);
8473
8474 Inst.setDesc(get(NewOpc));
8475 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8476 Inst.addImplicitDefUseOperands(*MBB.getParent());
8477 MRI.replaceRegWith(OldDstReg, ResultReg);
8478 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8479
8480 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8481 return std::pair(true, NewBB);
8482 }
8483
8484 return std::pair(false, nullptr);
8485}
8486
8487void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8488 MachineDominatorTree *MDT) const {
8489
8490 MachineBasicBlock &MBB = *Inst.getParent();
8491 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8492 MachineBasicBlock::iterator MII = Inst;
8493 const DebugLoc &DL = Inst.getDebugLoc();
8494
8495 MachineOperand &Dest = Inst.getOperand(0);
8496 MachineOperand &Src0 = Inst.getOperand(1);
8497 MachineOperand &Src1 = Inst.getOperand(2);
8498 MachineOperand &Cond = Inst.getOperand(3);
8499
8500 Register CondReg = Cond.getReg();
8501 bool IsSCC = (CondReg == AMDGPU::SCC);
8502
8503 // If this is a trivial select where the condition is effectively not SCC
8504 // (CondReg is the source of a copy to SCC), then the select is semantically
8505 // equivalent to copying CondReg. Hence, there is no need to create a
8506 // V_CNDMASK; we can just use CondReg and bail out.
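// Illustrative example (not from the original source): a select of the form
// "dst = select cond, -1, 0" where cond is already a lane-mask register (not
// SCC itself) produces exactly cond, so dst is simply replaced by CondReg.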
8507 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8508 (Src1.getImm() == 0)) {
8509 MRI.replaceRegWith(Dest.getReg(), CondReg);
8510 return;
8511 }
8512
8513 Register NewCondReg = CondReg;
8514 if (IsSCC) {
8515 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8516 NewCondReg = MRI.createVirtualRegister(TC);
8517
8518 // Now look for the closest SCC def; if it is a copy,
8519 // replace CondReg with the COPY's source register.
8520 bool CopyFound = false;
8521 for (MachineInstr &CandI :
8522 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8523 Inst.getParent()->rend())) {
8524 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8525 -1) {
8526 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8527 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8528 .addReg(CandI.getOperand(1).getReg());
8529 CopyFound = true;
8530 }
8531 break;
8532 }
8533 }
8534 if (!CopyFound) {
8535 // SCC def is not a copy
8536 // Insert a trivial select instead of creating a copy, because a copy from
8537 // SCC would semantically mean just copying a single bit, but we may need
8538 // the result to be a vector condition mask that needs preserving.
8539 unsigned Opcode =
8540 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8541 auto NewSelect =
8542 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8543 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8544 }
8545 }
8546
8547 Register NewDestReg = MRI.createVirtualRegister(
8548 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8549 MachineInstr *NewInst;
8550 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8551 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8552 .addImm(0)
8553 .add(Src1) // False
8554 .addImm(0)
8555 .add(Src0) // True
8556 .addReg(NewCondReg);
8557 } else {
8558 NewInst =
8559 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8560 .add(Src1) // False
8561 .add(Src0) // True
8562 .addReg(NewCondReg);
8563 }
8564 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8565 legalizeOperands(*NewInst, MDT);
8566 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8567}
8568
8569void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8570 MachineInstr &Inst) const {
8571 MachineBasicBlock &MBB = *Inst.getParent();
8572 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8573 MachineBasicBlock::iterator MII = Inst;
8574 const DebugLoc &DL = Inst.getDebugLoc();
8575
8576 MachineOperand &Dest = Inst.getOperand(0);
8577 MachineOperand &Src = Inst.getOperand(1);
8578 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8579 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8580
8581 unsigned SubOp = ST.hasAddNoCarry() ?
8582 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8583
8584 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8585 .addImm(0)
8586 .addReg(Src.getReg());
8587
8588 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8589 .addReg(Src.getReg())
8590 .addReg(TmpReg);
8591
8592 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8593 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8594}
8595
8596void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8597 MachineInstr &Inst) const {
8598 MachineBasicBlock &MBB = *Inst.getParent();
8599 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8600 MachineBasicBlock::iterator MII = Inst;
8601 const DebugLoc &DL = Inst.getDebugLoc();
8602
8603 MachineOperand &Dest = Inst.getOperand(0);
8604 MachineOperand &Src1 = Inst.getOperand(1);
8605 MachineOperand &Src2 = Inst.getOperand(2);
8606 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8607 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8608 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8609
8610 unsigned SubOp =
8611 ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8612
8613 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8614 .addReg(Src1.getReg())
8615 .addReg(Src2.getReg());
8616
8617 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8618
8619 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8620 .addReg(SubResultReg)
8621 .addReg(TmpReg);
8622
8623 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8624 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8625}
8626
8627void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8628 MachineInstr &Inst) const {
8629 MachineBasicBlock &MBB = *Inst.getParent();
8630 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8631 MachineBasicBlock::iterator MII = Inst;
8632 const DebugLoc &DL = Inst.getDebugLoc();
8633
8634 MachineOperand &Dest = Inst.getOperand(0);
8635 MachineOperand &Src0 = Inst.getOperand(1);
8636 MachineOperand &Src1 = Inst.getOperand(2);
8637
8638 if (ST.hasDLInsts()) {
8639 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8640 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8641 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8642
8643 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8644 .add(Src0)
8645 .add(Src1);
8646
8647 MRI.replaceRegWith(Dest.getReg(), NewDest);
8648 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8649 } else {
8650 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8651 // invert either source and then perform the XOR. If either source is a
8652 // scalar register, then we can leave the inversion on the scalar unit to
8653 // achieve a better distribution of scalar and vector instructions.
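// Worked example (not from the original source), on 4-bit values:
// with x = 0b1100 and y = 0b1010, ~(x ^ y) = ~0b0110 = 0b1001, and
// (~x) ^ y = 0b0011 ^ 0b1010 = 0b1001, so inverting one source first
// gives the same result.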
8654 bool Src0IsSGPR = Src0.isReg() &&
8655 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8656 bool Src1IsSGPR = Src1.isReg() &&
8657 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8658 MachineInstr *Xor;
8659 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8660 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8661
8662 // Build a pair of scalar instructions and add them to the work list.
8663 // The next iteration over the work list will lower these to the vector
8664 // unit as necessary.
8665 if (Src0IsSGPR) {
8666 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8667 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8668 .addReg(Temp)
8669 .add(Src1);
8670 } else if (Src1IsSGPR) {
8671 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8672 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8673 .add(Src0)
8674 .addReg(Temp);
8675 } else {
8676 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8677 .add(Src0)
8678 .add(Src1);
8679 MachineInstr *Not =
8680 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8681 Worklist.insert(Not);
8682 }
8683
8684 MRI.replaceRegWith(Dest.getReg(), NewDest);
8685
8686 Worklist.insert(Xor);
8687
8688 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8689 }
8690}
8691
8692void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8693 MachineInstr &Inst,
8694 unsigned Opcode) const {
8695 MachineBasicBlock &MBB = *Inst.getParent();
8696 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8697 MachineBasicBlock::iterator MII = Inst;
8698 const DebugLoc &DL = Inst.getDebugLoc();
8699
8700 MachineOperand &Dest = Inst.getOperand(0);
8701 MachineOperand &Src0 = Inst.getOperand(1);
8702 MachineOperand &Src1 = Inst.getOperand(2);
8703
8704 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8705 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8706
8707 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8708 .add(Src0)
8709 .add(Src1);
8710
8711 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8712 .addReg(Interm);
8713
8714 Worklist.insert(&Op);
8715 Worklist.insert(&Not);
8716
8717 MRI.replaceRegWith(Dest.getReg(), NewDest);
8718 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8719}
8720
8721void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8722 MachineInstr &Inst,
8723 unsigned Opcode) const {
8724 MachineBasicBlock &MBB = *Inst.getParent();
8725 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8726 MachineBasicBlock::iterator MII = Inst;
8727 const DebugLoc &DL = Inst.getDebugLoc();
8728
8729 MachineOperand &Dest = Inst.getOperand(0);
8730 MachineOperand &Src0 = Inst.getOperand(1);
8731 MachineOperand &Src1 = Inst.getOperand(2);
8732
8733 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8734 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8735
8736 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8737 .add(Src1);
8738
8739 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8740 .add(Src0)
8741 .addReg(Interm);
8742
8743 Worklist.insert(&Not);
8744 Worklist.insert(&Op);
8745
8746 MRI.replaceRegWith(Dest.getReg(), NewDest);
8747 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8748}
8749
8750void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8751 MachineInstr &Inst, unsigned Opcode,
8752 bool Swap) const {
8753 MachineBasicBlock &MBB = *Inst.getParent();
8754 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8755
8756 MachineOperand &Dest = Inst.getOperand(0);
8757 MachineOperand &Src0 = Inst.getOperand(1);
8758 const DebugLoc &DL = Inst.getDebugLoc();
8759
8760 MachineBasicBlock::iterator MII = Inst;
8761
8762 const MCInstrDesc &InstDesc = get(Opcode);
8763 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8764 MRI.getRegClass(Src0.getReg()) :
8765 &AMDGPU::SGPR_32RegClass;
8766
8767 const TargetRegisterClass *Src0SubRC =
8768 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8769
8770 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8771 AMDGPU::sub0, Src0SubRC);
8772
8773 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8774 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8775 const TargetRegisterClass *NewDestSubRC =
8776 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8777
8778 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8779 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8780
8781 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8782 AMDGPU::sub1, Src0SubRC);
8783
8784 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8785 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8786
8787 if (Swap)
8788 std::swap(DestSub0, DestSub1);
8789
8790 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8791 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8792 .addReg(DestSub0)
8793 .addImm(AMDGPU::sub0)
8794 .addReg(DestSub1)
8795 .addImm(AMDGPU::sub1);
8796
8797 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8798
8799 Worklist.insert(&LoHalf);
8800 Worklist.insert(&HiHalf);
8801
8802 // We don't need to legalizeOperands here because for a single operand, src0
8803 // will support any kind of input.
8804
8805 // Move all users of this moved value.
8806 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8807}
8808
8809 // There is not a vector equivalent of s_mul_u64. For this reason, we need to
8810 // split the s_mul_u64 into 32-bit vector multiplications.
8811void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8812 MachineInstr &Inst,
8813 MachineDominatorTree *MDT) const {
8814 MachineBasicBlock &MBB = *Inst.getParent();
8815 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8816
8817 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8818 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8819 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8820
8821 MachineOperand &Dest = Inst.getOperand(0);
8822 MachineOperand &Src0 = Inst.getOperand(1);
8823 MachineOperand &Src1 = Inst.getOperand(2);
8824 const DebugLoc &DL = Inst.getDebugLoc();
8825 MachineBasicBlock::iterator MII = Inst;
8826
8827 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8828 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8829 const TargetRegisterClass *Src0SubRC =
8830 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8831 if (RI.isSGPRClass(Src0SubRC))
8832 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8833 const TargetRegisterClass *Src1SubRC =
8834 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8835 if (RI.isSGPRClass(Src1SubRC))
8836 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8837
8838 // First, we extract the low 32-bit and high 32-bit values from each of the
8839 // operands.
8840 MachineOperand Op0L =
8841 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8842 MachineOperand Op1L =
8843 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8844 MachineOperand Op0H =
8845 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8846 MachineOperand Op1H =
8847 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8848
8849 // The multiplication is done as follows:
8850 //
8851 // Op1H Op1L
8852 // * Op0H Op0L
8853 // --------------------
8854 // Op1H*Op0L Op1L*Op0L
8855 // + Op1H*Op0H Op1L*Op0H
8856 // -----------------------------------------
8857 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8858 //
8859 // We drop Op1H*Op0H because it contributes only to bits 64 and above,
8860 // which do not exist in the 64-bit result.
8861 // The low 32-bit value is Op1L*Op0L.
8862 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
8863
8864 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8865 MachineInstr *Op1L_Op0H =
8866 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8867 .add(Op1L)
8868 .add(Op0H);
8869
8870 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8871 MachineInstr *Op1H_Op0L =
8872 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8873 .add(Op1H)
8874 .add(Op0L);
8875
8876 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8877 MachineInstr *Carry =
8878 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8879 .add(Op1L)
8880 .add(Op0L);
8881
8882 MachineInstr *LoHalf =
8883 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8884 .add(Op1L)
8885 .add(Op0L);
8886
8887 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8888 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8889 .addReg(Op1L_Op0H_Reg)
8890 .addReg(Op1H_Op0L_Reg);
8891
8892 MachineInstr *HiHalf =
8893 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8894 .addReg(AddReg)
8895 .addReg(CarryReg);
8896
8897 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8898 .addReg(DestSub0)
8899 .addImm(AMDGPU::sub0)
8900 .addReg(DestSub1)
8901 .addImm(AMDGPU::sub1);
8902
8903 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8904
8905 // Try to legalize the operands in case we need to swap the order to keep it
8906 // valid.
8907 legalizeOperands(*Op1L_Op0H, MDT);
8908 legalizeOperands(*Op1H_Op0L, MDT);
8909 legalizeOperands(*Carry, MDT);
8910 legalizeOperands(*LoHalf, MDT);
8911 legalizeOperands(*Add, MDT);
8912 legalizeOperands(*HiHalf, MDT);
8913
8914 // Move all users of this moved value.
8915 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8916}
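
// Illustrative sketch (not part of the original SIInstrInfo.cpp): the same
// partial-product decomposition written as a hypothetical plain-C++ helper.
// Each statement is annotated with the VALU instruction that computes it in
// the lowering above.
[[maybe_unused]] static uint64_t splitScalarSMulU64Sketch(uint64_t A,
                                                          uint64_t B) {
  uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);    // Op0L, Op0H
  uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);    // Op1L, Op1H
  uint32_t Lo = ALo * BLo;                                // V_MUL_LO_U32
  uint32_t Carry = uint32_t((uint64_t(ALo) * BLo) >> 32); // V_MUL_HI_U32
  uint32_t Hi = BLo * AHi + BHi * ALo + Carry;            // 2x V_MUL_LO_U32 + V_ADD_U32
  return (uint64_t(Hi) << 32) | Lo;                       // REG_SEQUENCE
}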
8917
8918 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8919 // multiplications.
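// Illustrative note (not from the original source): for the U32 pseudo both
// sources are zero-extended 32-bit values, so the full 64-bit product is
// (mulhi_u32(a, b) << 32) | mullo_u32(a, b); the I32 pseudo uses the signed
// mulhi for the upper half, while the low half is computed the same way.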
8920void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8921 MachineInstr &Inst,
8922 MachineDominatorTree *MDT) const {
8923 MachineBasicBlock &MBB = *Inst.getParent();
8924 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8925
8926 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8927 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8928 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8929
8930 MachineOperand &Dest = Inst.getOperand(0);
8931 MachineOperand &Src0 = Inst.getOperand(1);
8932 MachineOperand &Src1 = Inst.getOperand(2);
8933 const DebugLoc &DL = Inst.getDebugLoc();
8934 MachineBasicBlock::iterator MII = Inst;
8935
8936 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8937 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8938 const TargetRegisterClass *Src0SubRC =
8939 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8940 if (RI.isSGPRClass(Src0SubRC))
8941 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8942 const TargetRegisterClass *Src1SubRC =
8943 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8944 if (RI.isSGPRClass(Src1SubRC))
8945 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8946
8947 // First, we extract the low 32-bit and high 32-bit values from each of the
8948 // operands.
8949 MachineOperand Op0L =
8950 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8951 MachineOperand Op1L =
8952 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8953
8954 unsigned Opc = Inst.getOpcode();
8955 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8956 ? AMDGPU::V_MUL_HI_U32_e64
8957 : AMDGPU::V_MUL_HI_I32_e64;
8958 MachineInstr *HiHalf =
8959 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8960
8961 MachineInstr *LoHalf =
8962 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8963 .add(Op1L)
8964 .add(Op0L);
8965
8966 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8967 .addReg(DestSub0)
8968 .addImm(AMDGPU::sub0)
8969 .addReg(DestSub1)
8970 .addImm(AMDGPU::sub1);
8971
8972 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8973
8974 // Try to legalize the operands in case we need to swap the order to keep it
8975 // valid.
8976 legalizeOperands(*HiHalf, MDT);
8977 legalizeOperands(*LoHalf, MDT);
8978
8979 // Move all users of this moved value.
8980 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8981}
8982
8983void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8984 MachineInstr &Inst, unsigned Opcode,
8985 MachineDominatorTree *MDT) const {
8986 MachineBasicBlock &MBB = *Inst.getParent();
8987 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8988
8989 MachineOperand &Dest = Inst.getOperand(0);
8990 MachineOperand &Src0 = Inst.getOperand(1);
8991 MachineOperand &Src1 = Inst.getOperand(2);
8992 const DebugLoc &DL = Inst.getDebugLoc();
8993
8994 MachineBasicBlock::iterator MII = Inst;
8995
8996 const MCInstrDesc &InstDesc = get(Opcode);
8997 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8998 MRI.getRegClass(Src0.getReg()) :
8999 &AMDGPU::SGPR_32RegClass;
9000
9001 const TargetRegisterClass *Src0SubRC =
9002 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9003 const TargetRegisterClass *Src1RC = Src1.isReg() ?
9004 MRI.getRegClass(Src1.getReg()) :
9005 &AMDGPU::SGPR_32RegClass;
9006
9007 const TargetRegisterClass *Src1SubRC =
9008 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9009
9010 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9011 AMDGPU::sub0, Src0SubRC);
9012 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9013 AMDGPU::sub0, Src1SubRC);
9014 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9015 AMDGPU::sub1, Src0SubRC);
9016 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9017 AMDGPU::sub1, Src1SubRC);
9018
9019 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9020 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
9021 const TargetRegisterClass *NewDestSubRC =
9022 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9023
9024 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
9025 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
9026 .add(SrcReg0Sub0)
9027 .add(SrcReg1Sub0);
9028
9029 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
9030 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
9031 .add(SrcReg0Sub1)
9032 .add(SrcReg1Sub1);
9033
9034 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
9035 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9036 .addReg(DestSub0)
9037 .addImm(AMDGPU::sub0)
9038 .addReg(DestSub1)
9039 .addImm(AMDGPU::sub1);
9040
9041 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9042
9043 Worklist.insert(&LoHalf);
9044 Worklist.insert(&HiHalf);
9045
9046 // Move all users of this moved value.
9047 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9048}
9049
9050void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9051 MachineInstr &Inst,
9052 MachineDominatorTree *MDT) const {
9053 MachineBasicBlock &MBB = *Inst.getParent();
9054 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9055
9056 MachineOperand &Dest = Inst.getOperand(0);
9057 MachineOperand &Src0 = Inst.getOperand(1);
9058 MachineOperand &Src1 = Inst.getOperand(2);
9059 const DebugLoc &DL = Inst.getDebugLoc();
9060
9061 MachineBasicBlock::iterator MII = Inst;
9062
9063 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9064
9065 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9066
9067 MachineOperand* Op0;
9068 MachineOperand* Op1;
9069
9070 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9071 Op0 = &Src0;
9072 Op1 = &Src1;
9073 } else {
9074 Op0 = &Src1;
9075 Op1 = &Src0;
9076 }
9077
9078 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9079 .add(*Op0);
9080
9081 Register NewDest = MRI.createVirtualRegister(DestRC);
9082
9083 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9084 .addReg(Interm)
9085 .add(*Op1);
9086
9087 MRI.replaceRegWith(Dest.getReg(), NewDest);
9088
9089 Worklist.insert(&Xor);
9090}
9091
9092void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9093 MachineInstr &Inst) const {
9094 MachineBasicBlock &MBB = *Inst.getParent();
9095 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9096
9097 MachineBasicBlock::iterator MII = Inst;
9098 const DebugLoc &DL = Inst.getDebugLoc();
9099
9100 MachineOperand &Dest = Inst.getOperand(0);
9101 MachineOperand &Src = Inst.getOperand(1);
9102
9103 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
9104 const TargetRegisterClass *SrcRC = Src.isReg() ?
9105 MRI.getRegClass(Src.getReg()) :
9106 &AMDGPU::SGPR_32RegClass;
9107
9108 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9109 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9110
9111 const TargetRegisterClass *SrcSubRC =
9112 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9113
9114 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9115 AMDGPU::sub0, SrcSubRC);
9116 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9117 AMDGPU::sub1, SrcSubRC);
9118
9119 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
9120
9121 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
9122
9123 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9124
9125 // We don't need to legalize operands here. src0 for either instruction can be
9126 // an SGPR, and the second input is unused or determined here.
9127 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9128}
9129
9130void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9131 MachineInstr &Inst) const {
9132 MachineBasicBlock &MBB = *Inst.getParent();
9133 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9134 MachineBasicBlock::iterator MII = Inst;
9135 const DebugLoc &DL = Inst.getDebugLoc();
9136
9137 MachineOperand &Dest = Inst.getOperand(0);
9138 uint32_t Imm = Inst.getOperand(2).getImm();
9139 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9140 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
9141
9142 (void) Offset;
9143
9144 // Only sext_inreg cases handled.
9145 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9146 Offset == 0 && "Not implemented");
9147
9148 if (BitWidth < 32) {
9149 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9150 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9151 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9152
9153 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
9154 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
9155 .addImm(0)
9156 .addImm(BitWidth);
9157
9158 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
9159 .addImm(31)
9160 .addReg(MidRegLo);
9161
9162 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9163 .addReg(MidRegLo)
9164 .addImm(AMDGPU::sub0)
9165 .addReg(MidRegHi)
9166 .addImm(AMDGPU::sub1);
9167
9168 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9169 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9170 return;
9171 }
9172
9173 MachineOperand &Src = Inst.getOperand(1);
9174 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9175 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9176
9177 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
9178 .addImm(31)
9179 .addReg(Src.getReg(), 0, AMDGPU::sub0);
9180
9181 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9182 .addReg(Src.getReg(), 0, AMDGPU::sub0)
9183 .addImm(AMDGPU::sub0)
9184 .addReg(TmpReg)
9185 .addImm(AMDGPU::sub1);
9186
9187 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9188 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9189}
9190
9191void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9192 MachineInstr &Inst, unsigned Opcode,
9193 MachineDominatorTree *MDT) const {
9194 // (S_FLBIT_I32_B64 hi:lo) ->
9195 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9196 // (S_FF1_I32_B64 hi:lo) ->
9197 // -> (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
9198
9199 MachineBasicBlock &MBB = *Inst.getParent();
9200 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9201 MachineBasicBlock::iterator MII = Inst;
9202 const DebugLoc &DL = Inst.getDebugLoc();
9203
9204 MachineOperand &Dest = Inst.getOperand(0);
9205 MachineOperand &Src = Inst.getOperand(1);
9206
9207 const MCInstrDesc &InstDesc = get(Opcode);
9208
9209 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9210 unsigned OpcodeAdd =
9211 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
9212
9213 const TargetRegisterClass *SrcRC =
9214 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9215 const TargetRegisterClass *SrcSubRC =
9216 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9217
9218 MachineOperand SrcRegSub0 =
9219 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9220 MachineOperand SrcRegSub1 =
9221 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9222
9223 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9224 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9225 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9226 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9227
9228 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9229
9230 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9231
9232 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9233 .addReg(IsCtlz ? MidReg1 : MidReg2)
9234 .addImm(32)
9235 .addImm(1); // enable clamp
9236
9237 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9238 .addReg(MidReg3)
9239 .addReg(IsCtlz ? MidReg2 : MidReg1);
9240
9241 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9242
9243 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9244}
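
// Illustrative sketch (not part of the original SIInstrInfo.cpp): the ctlz
// form of the expansion above as a hypothetical plain-C++ helper, with
// V_FFBH_U32 modelled as returning ~0u for a zero input and the clamp bit
// modelled as a saturating add.
[[maybe_unused]] static uint32_t splitCountOpCtlzSketch(uint32_t Hi,
                                                        uint32_t Lo) {
  auto FFBH = [](uint32_t V) {
    return V ? uint32_t(llvm::countl_zero(V)) : ~0u;      // V_FFBH_U32
  };
  uint32_t FromLo = FFBH(Lo);
  FromLo = FromLo > ~0u - 32 ? ~0u : FromLo + 32;         // uaddsat(FFBH(lo), 32)
  uint32_t FromHi = FFBH(Hi);
  return FromHi < FromLo ? FromHi : FromLo;               // V_MIN_U32
}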
9245
9246void SIInstrInfo::addUsersToMoveToVALUWorklist(
9247 Register DstReg, MachineRegisterInfo &MRI,
9248 SIInstrWorklist &Worklist) const {
9249 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9250 MachineInstr &UseMI = *MO.getParent();
9251
9252 unsigned OpNo = 0;
9253
9254 switch (UseMI.getOpcode()) {
9255 case AMDGPU::COPY:
9256 case AMDGPU::WQM:
9257 case AMDGPU::SOFT_WQM:
9258 case AMDGPU::STRICT_WWM:
9259 case AMDGPU::STRICT_WQM:
9260 case AMDGPU::REG_SEQUENCE:
9261 case AMDGPU::PHI:
9262 case AMDGPU::INSERT_SUBREG:
9263 break;
9264 default:
9265 OpNo = MO.getOperandNo();
9266 break;
9267 }
9268
9269 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9270 MRI.constrainRegClass(DstReg, OpRC);
9271
9272 if (!RI.hasVectorRegisters(OpRC))
9273 Worklist.insert(&UseMI);
9274 else
9275 // Legalization could change user list.
9276 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9277 }
9278}
9279
9280void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9281 MachineRegisterInfo &MRI,
9282 MachineInstr &Inst) const {
9283 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9284 MachineBasicBlock *MBB = Inst.getParent();
9285 MachineOperand &Src0 = Inst.getOperand(1);
9286 MachineOperand &Src1 = Inst.getOperand(2);
9287 const DebugLoc &DL = Inst.getDebugLoc();
9288
9289 if (ST.useRealTrue16Insts()) {
9290 Register SrcReg0, SrcReg1;
9291 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9292 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9293 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
9294 } else {
9295 SrcReg0 = Src0.getReg();
9296 }
9297
9298 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9299 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9300 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
9301 } else {
9302 SrcReg1 = Src1.getReg();
9303 }
9304
9305 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9306 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9307
9308 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9309 switch (Inst.getOpcode()) {
9310 case AMDGPU::S_PACK_LL_B32_B16:
9311 NewMI
9312 .addReg(SrcReg0, 0,
9313 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9314 .addImm(AMDGPU::lo16)
9315 .addReg(SrcReg1, 0,
9316 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9317 .addImm(AMDGPU::hi16);
9318 break;
9319 case AMDGPU::S_PACK_LH_B32_B16:
9320 NewMI
9321 .addReg(SrcReg0, 0,
9322 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9323 .addImm(AMDGPU::lo16)
9324 .addReg(SrcReg1, 0, AMDGPU::hi16)
9325 .addImm(AMDGPU::hi16);
9326 break;
9327 case AMDGPU::S_PACK_HL_B32_B16:
9328 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9329 .addImm(AMDGPU::lo16)
9330 .addReg(SrcReg1, 0,
9331 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9332 .addImm(AMDGPU::hi16);
9333 break;
9334 case AMDGPU::S_PACK_HH_B32_B16:
9335 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9336 .addImm(AMDGPU::lo16)
9337 .addReg(SrcReg1, 0, AMDGPU::hi16)
9338 .addImm(AMDGPU::hi16);
9339 break;
9340 default:
9341 llvm_unreachable("unhandled s_pack_* instruction");
9342 }
9343
9344 MachineOperand &Dest = Inst.getOperand(0);
9345 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9346 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9347 return;
9348 }
9349
9350 switch (Inst.getOpcode()) {
9351 case AMDGPU::S_PACK_LL_B32_B16: {
9352 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9353 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9354
9355 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9356 // 0.
9357 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9358 .addImm(0xffff);
9359
9360 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9361 .addReg(ImmReg, RegState::Kill)
9362 .add(Src0);
9363
9364 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9365 .add(Src1)
9366 .addImm(16)
9367 .addReg(TmpReg, RegState::Kill);
9368 break;
9369 }
9370 case AMDGPU::S_PACK_LH_B32_B16: {
9371 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9372 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9373 .addImm(0xffff);
9374 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9375 .addReg(ImmReg, RegState::Kill)
9376 .add(Src0)
9377 .add(Src1);
9378 break;
9379 }
9380 case AMDGPU::S_PACK_HL_B32_B16: {
9381 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9382 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9383 .addImm(16)
9384 .add(Src0);
9385 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9386 .add(Src1)
9387 .addImm(16)
9388 .addReg(TmpReg, RegState::Kill);
9389 break;
9390 }
9391 case AMDGPU::S_PACK_HH_B32_B16: {
9392 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9393 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9394 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9395 .addImm(16)
9396 .add(Src0);
9397 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9398 .addImm(0xffff0000);
9399 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9400 .add(Src1)
9401 .addReg(ImmReg, RegState::Kill)
9402 .addReg(TmpReg, RegState::Kill);
9403 break;
9404 }
9405 default:
9406 llvm_unreachable("unhandled s_pack_* instruction");
9407 }
9408
9409 MachineOperand &Dest = Inst.getOperand(0);
9410 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9411 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9412}
9413
9414void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9415 MachineInstr &SCCDefInst,
9416 SIInstrWorklist &Worklist,
9417 Register NewCond) const {
9418
9419 // Ensure that def inst defines SCC, which is still live.
9420 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9421 !Op.isDead() && Op.getParent() == &SCCDefInst);
9422 SmallVector<MachineInstr *, 4> CopyToDelete;
9423 // This assumes that all the users of SCC are in the same block
9424 // as the SCC def.
9425 for (MachineInstr &MI : // Skip the def inst itself.
9426 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9427 SCCDefInst.getParent()->end())) {
9428 // Check if SCC is used first.
9429 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9430 if (SCCIdx != -1) {
9431 if (MI.isCopy()) {
9432 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9433 Register DestReg = MI.getOperand(0).getReg();
9434
9435 MRI.replaceRegWith(DestReg, NewCond);
9436 CopyToDelete.push_back(&MI);
9437 } else {
9438
9439 if (NewCond.isValid())
9440 MI.getOperand(SCCIdx).setReg(NewCond);
9441
9442 Worklist.insert(&MI);
9443 }
9444 }
9445 // Exit if we find another SCC def.
9446 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9447 break;
9448 }
9449 for (auto &Copy : CopyToDelete)
9450 Copy->eraseFromParent();
9451}
9452
9453// Instructions that use SCC may be converted to VALU instructions. When that
9454// happens, the SCC register is changed to VCC_LO. The instruction that defines
9455// SCC must be changed to an instruction that defines VCC. This function makes
9456// sure that the instruction that defines SCC is added to the moveToVALU
9457// worklist.
9458void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9459 SIInstrWorklist &Worklist) const {
9460 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9461 // then there is nothing to do because the defining instruction has been
9462 // converted to a VALU already. If SCC then that instruction needs to be
9463 // converted to a VALU.
9464 for (MachineInstr &MI :
9465 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9466 SCCUseInst->getParent()->rend())) {
9467 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9468 break;
9469 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9470 Worklist.insert(&MI);
9471 break;
9472 }
9473 }
9474}
9475
9476const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9477 const MachineInstr &Inst) const {
9478 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9479
9480 switch (Inst.getOpcode()) {
9481 // For target instructions, getOpRegClass just returns the virtual register
9482 // class associated with the operand, so we need to find an equivalent VGPR
9483 // register class in order to move the instruction to the VALU.
9484 case AMDGPU::COPY:
9485 case AMDGPU::PHI:
9486 case AMDGPU::REG_SEQUENCE:
9487 case AMDGPU::INSERT_SUBREG:
9488 case AMDGPU::WQM:
9489 case AMDGPU::SOFT_WQM:
9490 case AMDGPU::STRICT_WWM:
9491 case AMDGPU::STRICT_WQM: {
9492 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9493 if (RI.isAGPRClass(SrcRC)) {
9494 if (RI.isAGPRClass(NewDstRC))
9495 return nullptr;
9496
9497 switch (Inst.getOpcode()) {
9498 case AMDGPU::PHI:
9499 case AMDGPU::REG_SEQUENCE:
9500 case AMDGPU::INSERT_SUBREG:
9501 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9502 break;
9503 default:
9504 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9505 }
9506
9507 if (!NewDstRC)
9508 return nullptr;
9509 } else {
9510 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9511 return nullptr;
9512
9513 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9514 if (!NewDstRC)
9515 return nullptr;
9516 }
9517
9518 return NewDstRC;
9519 }
9520 default:
9521 return NewDstRC;
9522 }
9523}
9524
9525// Find the one SGPR operand we are allowed to use.
9526Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9527 int OpIndices[3]) const {
9528 const MCInstrDesc &Desc = MI.getDesc();
9529
9530 // Find the one SGPR operand we are allowed to use.
9531 //
9532 // First we need to consider the instruction's operand requirements before
9533 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9534 // of VCC, but we are still bound by the constant bus requirement to only use
9535 // one.
9536 //
9537 // If the operand's class is an SGPR, we can never move it.
9538
9539 Register SGPRReg = findImplicitSGPRRead(MI);
9540 if (SGPRReg)
9541 return SGPRReg;
9542
9543 Register UsedSGPRs[3] = {Register()};
9544 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9545
9546 for (unsigned i = 0; i < 3; ++i) {
9547 int Idx = OpIndices[i];
9548 if (Idx == -1)
9549 break;
9550
9551 const MachineOperand &MO = MI.getOperand(Idx);
9552 if (!MO.isReg())
9553 continue;
9554
9555 // Is this operand statically required to be an SGPR based on the operand
9556 // constraints?
9557 const TargetRegisterClass *OpRC =
9558 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9559 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9560 if (IsRequiredSGPR)
9561 return MO.getReg();
9562
9563 // If this could be a VGPR or an SGPR, check the dynamic register class.
9564 Register Reg = MO.getReg();
9565 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9566 if (RI.isSGPRClass(RegRC))
9567 UsedSGPRs[i] = Reg;
9568 }
9569
9570 // We don't have a required SGPR operand, so we have a bit more freedom in
9571 // selecting operands to move.
9572
9573 // Try to select the most used SGPR. If an SGPR is equal to one of the
9574 // others, we choose that.
9575 //
9576 // e.g.
9577 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9578 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9579
9580 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9581 // prefer those.
9582
9583 if (UsedSGPRs[0]) {
9584 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9585 SGPRReg = UsedSGPRs[0];
9586 }
9587
9588 if (!SGPRReg && UsedSGPRs[1]) {
9589 if (UsedSGPRs[1] == UsedSGPRs[2])
9590 SGPRReg = UsedSGPRs[1];
9591 }
9592
9593 return SGPRReg;
9594}
9595
9596 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9597 AMDGPU::OpName OperandName) const {
9598 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9599 return nullptr;
9600
9601 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9602 if (Idx == -1)
9603 return nullptr;
9604
9605 return &MI.getOperand(Idx);
9606}
9607
9608 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9609 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9610 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9611 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9612 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9613 return (Format << 44) |
9614 (1ULL << 56) | // RESOURCE_LEVEL = 1
9615 (3ULL << 60); // OOB_SELECT = 3
9616 }
9617
9618 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9619 if (ST.isAmdHsaOS()) {
9620 // Set ATC = 1. GFX9 doesn't have this bit.
9621 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9622 RsrcDataFormat |= (1ULL << 56);
9623
9624 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9625 // BTW, it disables TC L2 and therefore decreases performance.
9626 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9627 RsrcDataFormat |= (2ULL << 59);
9628 }
9629
9630 return RsrcDataFormat;
9631}
9632
9633 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9634 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9635 AMDGPU::RSRC_TID_ENABLE |
9636 0xffffffff; // Size;
9637
9638 // GFX9 doesn't have ELEMENT_SIZE.
9639 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9640 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9641 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9642 }
9643
9644 // IndexStride = 64 / 32.
9645 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9646 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9647
9648 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9649 // Clear them unless we want a huge stride.
9650 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9651 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9652 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9653
9654 return Rsrc23;
9655}
9656
9657 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9658 unsigned Opc = MI.getOpcode();
9659
9660 return isSMRD(Opc);
9661}
9662
9663 bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9664 return get(Opc).mayLoad() &&
9665 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9666}
9667
9668 Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9669 int &FrameIndex) const {
9670 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9671 if (!Addr || !Addr->isFI())
9672 return Register();
9673
9674 assert(!MI.memoperands_empty() &&
9675 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9676
9677 FrameIndex = Addr->getIndex();
9678 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9679}
9680
9681 Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9682 int &FrameIndex) const {
9683 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9684 assert(Addr && Addr->isFI());
9685 FrameIndex = Addr->getIndex();
9686 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9687}
9688
9689 Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9690 int &FrameIndex) const {
9691 if (!MI.mayLoad())
9692 return Register();
9693
9694 if (isMUBUF(MI) || isVGPRSpill(MI))
9695 return isStackAccess(MI, FrameIndex);
9696
9697 if (isSGPRSpill(MI))
9698 return isSGPRStackAccess(MI, FrameIndex);
9699
9700 return Register();
9701}
9702
9703 Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9704 int &FrameIndex) const {
9705 if (!MI.mayStore())
9706 return Register();
9707
9708 if (isMUBUF(MI) || isVGPRSpill(MI))
9709 return isStackAccess(MI, FrameIndex);
9710
9711 if (isSGPRSpill(MI))
9712 return isSGPRStackAccess(MI, FrameIndex);
9713
9714 return Register();
9715}
9716
9717 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9718 unsigned Size = 0;
9719 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9720 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9721 while (++I != E && I->isInsideBundle()) {
9722 assert(!I->isBundle() && "No nested bundle!");
9723 Size += getInstSizeInBytes(*I);
9724 }
9725
9726 return Size;
9727}
9728
9729 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9730 unsigned Opc = MI.getOpcode();
9731 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9732 unsigned DescSize = Desc.getSize();
9733
9734 // If we have a definitive size, we can use it. Otherwise we need to inspect
9735 // the operands to know the size.
9736 if (isFixedSize(MI)) {
9737 unsigned Size = DescSize;
9738
9739 // If we hit the buggy offset, an extra nop will be inserted in MC so
9740 // estimate the worst case.
9741 if (MI.isBranch() && ST.hasOffset3fBug())
9742 Size += 4;
9743
9744 return Size;
9745 }
9746
9747 // Instructions may have a 32-bit literal encoded after them. Check
9748 // operands that could ever be literals.
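// Illustrative example (not from the original source): a V_ADD_F32_e32 whose
// sources are registers or inline constants is 4 bytes, while a source like
// 0x3e800000 (0.25) is not an inline constant and adds a 4-byte literal,
// giving 8 bytes.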
9749 if (isVALU(MI) || isSALU(MI)) {
9750 if (isDPP(MI))
9751 return DescSize;
9752 bool HasLiteral = false;
9753 unsigned LiteralSize = 4;
9754 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9755 const MachineOperand &Op = MI.getOperand(I);
9756 const MCOperandInfo &OpInfo = Desc.operands()[I];
9757 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9758 HasLiteral = true;
9759 if (ST.has64BitLiterals()) {
9760 switch (OpInfo.OperandType) {
9761 default:
9762 break;
9763 case AMDGPU::OPERAND_REG_IMM_FP64:
9764 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9765 LiteralSize = 8;
9766 break;
9767 case AMDGPU::OPERAND_REG_IMM_INT64:
9768 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9769 LiteralSize = 8;
9770 break;
9771 }
9772 }
9773 break;
9774 }
9775 }
9776 return HasLiteral ? DescSize + LiteralSize : DescSize;
9777 }
9778
9779 // Check whether we have extra NSA words.
9780 if (isMIMG(MI)) {
9781 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9782 if (VAddr0Idx < 0)
9783 return 8;
9784
9785 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9786 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9787 }
9788
9789 switch (Opc) {
9790 case TargetOpcode::BUNDLE:
9791 return getInstBundleSize(MI);
9792 case TargetOpcode::INLINEASM:
9793 case TargetOpcode::INLINEASM_BR: {
9794 const MachineFunction *MF = MI.getMF();
9795 const char *AsmStr = MI.getOperand(0).getSymbolName();
9796 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9797 }
9798 default:
9799 if (MI.isMetaInstruction())
9800 return 0;
9801
9802 // If D16 Pseudo inst, get correct MC code size
9803 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9804 if (D16Info) {
9805 // Assume the d16_lo/hi instructions are always the same size.
9806 unsigned LoInstOpcode = D16Info->LoOp;
9807 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9808 DescSize = Desc.getSize();
9809 }
9810
9811 // If FMA Pseudo inst, get correct MC code size
9812 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9813 // All potential lowerings are the same size; arbitrarily pick one.
9814 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9815 DescSize = Desc.getSize();
9816 }
9817
9818 return DescSize;
9819 }
9820}
9821
9822 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
9823 if (!isFLAT(MI))
9824 return false;
9825
9826 if (MI.memoperands_empty())
9827 return true;
9828
9829 for (const MachineMemOperand *MMO : MI.memoperands()) {
9830 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9831 return true;
9832 }
9833 return false;
9834}
9835
9836 ArrayRef<std::pair<int, const char *>>
9837 SIInstrInfo::getSerializableTargetIndices() const {
9838 static const std::pair<int, const char *> TargetIndices[] = {
9839 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9840 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9841 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9842 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9843 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9844 return ArrayRef(TargetIndices);
9845}
9846
9847/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9848/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9849 ScheduleHazardRecognizer *
9850 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9851 const ScheduleDAG *DAG) const {
9852 return new GCNHazardRecognizer(DAG->MF);
9853 }
9854
9855/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9856/// pass.
9857 ScheduleHazardRecognizer *SIInstrInfo::CreateTargetPostRAHazardRecognizer(
9858 const MachineFunction &MF) const {
9859 return new GCNHazardRecognizer(MF);
9860 }
9861
9862// Called during:
9863// - pre-RA scheduling and post-RA scheduling
9864 ScheduleHazardRecognizer *
9865 SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9866 const ScheduleDAGMI *DAG) const {
9867 // Borrowed from Arm Target
9868 // We would like to restrict this hazard recognizer to only
9869 // post-RA scheduling; we can tell that we're post-RA because we don't
9870 // track VRegLiveness.
9871 if (!DAG->hasVRegLiveness())
9872 return new GCNHazardRecognizer(DAG->MF);
9873 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9874}
9875
9876std::pair<unsigned, unsigned>
9877 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9878 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9879}
9880
9881 ArrayRef<std::pair<unsigned, const char *>>
9882 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9883 static const std::pair<unsigned, const char *> TargetFlags[] = {
9884 {MO_GOTPCREL, "amdgpu-gotprel"},
9885 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9886 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9887 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9888 {MO_REL32_LO, "amdgpu-rel32-lo"},
9889 {MO_REL32_HI, "amdgpu-rel32-hi"},
9890 {MO_REL64, "amdgpu-rel64"},
9891 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9892 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9893 {MO_ABS64, "amdgpu-abs64"},
9894 };
9895
9896 return ArrayRef(TargetFlags);
9897}
9898
9899 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9900 SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9901 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9902 {
9903 {MONoClobber, "amdgpu-noclobber"},
9904 {MOLastUse, "amdgpu-last-use"},
9905 {MOCooperative, "amdgpu-cooperative"},
9906 };
9907
9908 return ArrayRef(TargetFlags);
9909}
9910
9911 unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9912 const MachineFunction &MF) const {
9913 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9914 assert(SrcReg.isVirtual());
9915 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9916 return AMDGPU::WWM_COPY;
9917
9918 return AMDGPU::COPY;
9919}
9920
9921 bool SIInstrInfo::canAddToBBProlog(const MachineInstr &MI) const {
9922 uint16_t Opcode = MI.getOpcode();
9923 // Check if it is SGPR spill or wwm-register spill Opcode.
9924 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
9925 return true;
9926
9927 const MachineFunction *MF = MI.getMF();
9928 const MachineRegisterInfo &MRI = MF->getRegInfo();
9929 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9930
9931 // See if this is a live-range split instruction inserted for an SGPR or
9932 // wwm-register. The implicit defs inserted for wwm-registers should also be
9933 // included, as they can appear at the beginning of a basic block.
9934 bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
9935 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
9936 return false;
9937
9938 Register Reg = MI.getOperand(0).getReg();
9939 if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
9940 return IsLRSplitInst;
9941
9942 return MFI->isWWMReg(Reg);
9943}
9944
9945 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9946 Register Reg) const {
9947 // We need to handle instructions which may be inserted during register
9948 // allocation to handle the prolog. The initial prolog instruction may have
9949 // been separated from the start of the block by spills and copies inserted
9950 // needed by the prolog. However, the insertions for scalar registers can
9951 // always be placed at the BB top as they are independent of the exec mask
9952 // value.
9953 bool IsNullOrVectorRegister = true;
9954 if (Reg) {
9955 const MachineFunction *MF = MI.getMF();
9956 const MachineRegisterInfo &MRI = MF->getRegInfo();
9957 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9958 }
9959
9960 return IsNullOrVectorRegister &&
9961 (canAddToBBProlog(MI) ||
9962 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
9963 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9964}
9965
9966 MachineInstrBuilder
9967 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9968 MachineBasicBlock::iterator I,
9969 const DebugLoc &DL,
9970 Register DestReg) const {
9971 if (ST.hasAddNoCarry())
9972 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9973
9974 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9975 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9976 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9977
9978 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9979 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9980}
9981
9982 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9983 MachineBasicBlock::iterator I,
9984 const DebugLoc &DL,
9985 Register DestReg,
9986 RegScavenger &RS) const {
9987 if (ST.hasAddNoCarry())
9988 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9989
9990 // If available, prefer to use vcc.
9991 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9992 ? Register(RI.getVCC())
9993 : RS.scavengeRegisterBackwards(
9994 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9995 0, /* AllowSpill */ false);
9996
9997 // TODO: Users need to deal with this.
9998 if (!UnusedCarry.isValid())
9999 return MachineInstrBuilder();
10000
10001 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10002 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10003}
10004
10005bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10006 switch (Opcode) {
10007 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10008 case AMDGPU::SI_KILL_I1_TERMINATOR:
10009 return true;
10010 default:
10011 return false;
10012 }
10013}
10014
10015 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
10016 switch (Opcode) {
10017 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10018 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10019 case AMDGPU::SI_KILL_I1_PSEUDO:
10020 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
10021 default:
10022 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10023 }
10024}
10025
10026bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10027 return Imm <= getMaxMUBUFImmOffset(ST);
10028}
10029
10030 unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
10031 // GFX12 field is non-negative 24-bit signed byte offset.
10032 const unsigned OffsetBits =
10033 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
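// Illustrative note (not from the original source): this evaluates to
// 0x7fffff (8388607) on GFX12+ and 0xfff (4095) on earlier generations.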
10034 return (1 << OffsetBits) - 1;
10035}
10036
10037 void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
10038 if (!ST.isWave32())
10039 return;
10040
10041 if (MI.isInlineAsm())
10042 return;
10043
10044 for (auto &Op : MI.implicit_operands()) {
10045 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10046 Op.setReg(AMDGPU::VCC_LO);
10047 }
10048}
10049
10050 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
10051 if (!isSMRD(MI))
10052 return false;
10053
10054 // Check that it is using a buffer resource.
10055 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
10056 if (Idx == -1) // e.g. s_memtime
10057 return false;
10058
10059 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
10060 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
10061}
10062
10063// Given Imm, split it into the values to put into the SOffset and ImmOffset
10064// fields in an MUBUF instruction. Return false if it is not possible (due to a
10065// hardware bug needing a workaround).
10066//
10067// The required alignment ensures that individual address components remain
10068// aligned if they are aligned to begin with. It also ensures that additional
10069// offsets within the given alignment can be added to the resulting ImmOffset.
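// Worked example (not from the original source), assuming a 4095-byte max
// immediate and Align(4): Imm = 8192 is split as ImmOffset = 4 and
// SOffset = 8188, i.e. SOffset has all of its low bits set except the two
// alignment bits, and ImmOffset + SOffset == 8192.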
10070 bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
10071 uint32_t &ImmOffset, Align Alignment) const {
10072 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10073 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
10074 uint32_t Overflow = 0;
10075
10076 if (Imm > MaxImm) {
10077 if (Imm <= MaxImm + 64) {
10078 // Use an SOffset inline constant for 4..64
10079 Overflow = Imm - MaxImm;
10080 Imm = MaxImm;
10081 } else {
10082 // Try to keep the same value in SOffset for adjacent loads, so that
10083 // the corresponding register contents can be re-used.
10084 //
10085 // Load values with all low-bits (except for alignment bits) set into
10086 // SOffset, so that a larger range of values can be covered using
10087 // s_movk_i32.
10088 //
10089 // Atomic operations fail to work correctly when individual address
10090 // components are unaligned, even if their sum is aligned.
10091 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10092 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10093 Imm = Low;
10094 Overflow = High - Alignment.value();
10095 }
10096 }
10097
10098 if (Overflow > 0) {
10099 // There is a hardware bug in SI and CI which prevents address clamping in
10100 // MUBUF instructions from working correctly with SOffsets. The immediate
10101 // offset is unaffected.
10102 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10103 return false;
10104
10105 // On some targets it is not possible to use an immediate in the SOffset field.
10106 if (ST.hasRestrictedSOffset())
10107 return false;
10108 }
10109
10110 ImmOffset = Imm;
10111 SOffset = Overflow;
10112 return true;
10113}
10114
10115// Depending on the used address space and instructions, some immediate offsets
10116// are allowed and some are not.
10117 // Pre-GFX12, flat instruction offsets can only be non-negative; global and
10118// scratch instruction offsets can also be negative. On GFX12, offsets can be
10119// negative for all variants.
10120//
10121// There are several bugs related to these offsets:
10122// On gfx10.1, flat instructions that go into the global address space cannot
10123// use an offset.
10124//
10125// For scratch instructions, the address can be either an SGPR or a VGPR.
10126// The following offsets can be used, depending on the architecture (x means
10127// cannot be used):
10128// +----------------------------+------+------+
10129// | Address-Mode | SGPR | VGPR |
10130// +----------------------------+------+------+
10131// | gfx9 | | |
10132// | negative, 4-aligned offset | x | ok |
10133// | negative, unaligned offset | x | ok |
10134// +----------------------------+------+------+
10135// | gfx10 | | |
10136// | negative, 4-aligned offset | ok | ok |
10137// | negative, unaligned offset | ok | x |
10138// +----------------------------+------+------+
10139// | gfx10.3 | | |
10140// | negative, 4-aligned offset | ok | ok |
10141// | negative, unaligned offset | ok | ok |
10142// +----------------------------+------+------+
10143//
10144// This function ignores the addressing mode, so if an offset cannot be used in
10145// one addressing mode, it is considered illegal.
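//
// For example (illustrative), on a subtarget where
// AMDGPU::getNumFlatOffsetBits(ST) returns 13, variants that allow negative
// offsets accept values in [-4096, 4095], while the remaining variants accept
// only [0, 4095], assuming none of the bug workarounds above apply.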
10146bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10147 uint64_t FlatVariant) const {
10148 // TODO: Should 0 be special cased?
10149 if (!ST.hasFlatInstOffsets())
10150 return false;
10151
10152 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10153 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10154 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10155 return false;
10156
10157 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10158 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10159 (Offset % 4) != 0) {
10160 return false;
10161 }
10162
10163 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10164 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10165 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10166}
10167
10168 // See the comment on SIInstrInfo::isLegalFLATOffset for what is legal and what is not.
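//
// As an illustration, assuming AMDGPU::getNumFlatOffsetBits(ST) returns 13 (so
// the truncating divisor below is 1 << 12 == 4096) and negative offsets are
// allowed:
//   COffsetVal = 10000  ->  {ImmField = 1808, RemainderOffset = 8192}
//   COffsetVal = -5000  ->  {ImmField = -904, RemainderOffset = -4096}
// In both cases ImmField + RemainderOffset == COffsetVal.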
10169std::pair<int64_t, int64_t>
10170SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10171 uint64_t FlatVariant) const {
10172 int64_t RemainderOffset = COffsetVal;
10173 int64_t ImmField = 0;
10174
10175 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10176 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10177
10178 if (AllowNegative) {
10179 // Use signed division by a power of two to truncate towards 0.
10180 int64_t D = 1LL << NumBits;
10181 RemainderOffset = (COffsetVal / D) * D;
10182 ImmField = COffsetVal - RemainderOffset;
10183
10184 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10185 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10186 (ImmField % 4) != 0) {
10187 // Make ImmField a multiple of 4
10188 RemainderOffset += ImmField % 4;
10189 ImmField -= ImmField % 4;
10190 }
10191 } else if (COffsetVal >= 0) {
10192 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10193 RemainderOffset = COffsetVal - ImmField;
10194 }
10195
10196 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10197 assert(RemainderOffset + ImmField == COffsetVal);
10198 return {ImmField, RemainderOffset};
10199}
10200
10201bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
10202 if (ST.hasNegativeScratchOffsetBug() &&
10203 FlatVariant == SIInstrFlags::FlatScratch)
10204 return false;
10205
10206 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
10207}
10208
10209static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10210 switch (ST.getGeneration()) {
10211 default:
10212 break;
10213 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
10214 case AMDGPUSubtarget::SEA_ISLANDS:
10215 return SIEncodingFamily::SI;
10216 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
10217 case AMDGPUSubtarget::GFX9:
10218 return SIEncodingFamily::VI;
10219 case AMDGPUSubtarget::GFX10:
10220 return SIEncodingFamily::GFX10;
10221 case AMDGPUSubtarget::GFX11:
10222 return SIEncodingFamily::GFX11;
10223 case AMDGPUSubtarget::GFX12:
10224 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10225 : SIEncodingFamily::GFX12;
10226 }
10227 llvm_unreachable("Unknown subtarget generation!");
10228}
10229
10230bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10231 switch(MCOp) {
10232 // These opcodes use indirect register addressing so
10233 // they need special handling by codegen (currently missing).
10234 // Therefore it is too risky to allow these opcodes
10235 // to be selected by dpp combiner or sdwa peepholer.
10236 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10237 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10238 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10239 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10240 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10241 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10242 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10243 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10244 return true;
10245 default:
10246 return false;
10247 }
10248}
10249
10250#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10251 case OPCODE##_dpp: \
10252 case OPCODE##_e32: \
10253 case OPCODE##_e64: \
10254 case OPCODE##_e64_dpp: \
10255 case OPCODE##_sdwa:
10256
10257static bool isRenamedInGFX9(int Opcode) {
10258 switch (Opcode) {
10259 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10260 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10261 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10262 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10263 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10264 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10265 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10266 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10267 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10268 //
10269 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10270 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10271 case AMDGPU::V_FMA_F16_gfx9_e64:
10272 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10273 case AMDGPU::V_INTERP_P2_F16:
10274 case AMDGPU::V_MAD_F16_e64:
10275 case AMDGPU::V_MAD_U16_e64:
10276 case AMDGPU::V_MAD_I16_e64:
10277 return true;
10278 default:
10279 return false;
10280 }
10281}
10282
10283int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10284 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10285 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10286
10287 unsigned Gen = subtargetEncodingFamily(ST);
10288
10289 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10290 Gen = SIEncodingFamily::GFX9;
10291
10292 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10293 // subtarget has UnpackedD16VMem feature.
10294 // TODO: remove this when we discard GFX80 encoding.
10295 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10296 Gen = SIEncodingFamily::GFX80;
10297
10298 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10299 switch (ST.getGeneration()) {
10300 default:
10301 Gen = SIEncodingFamily::SDWA;
10302 break;
10303 case AMDGPUSubtarget::GFX9:
10304 Gen = SIEncodingFamily::SDWA9;
10305 break;
10306 case AMDGPUSubtarget::GFX10:
10307 Gen = SIEncodingFamily::SDWA10;
10308 break;
10309 }
10310 }
10311
10312 if (isMAI(Opcode)) {
10313 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10314 if (MFMAOp != -1)
10315 Opcode = MFMAOp;
10316 }
10317
10318 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10319
10320 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
10321 MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX12);
10322
10323 // -1 means that Opcode is already a native instruction.
10324 if (MCOp == -1)
10325 return Opcode;
10326
10327 if (ST.hasGFX90AInsts()) {
10328 uint16_t NMCOp = (uint16_t)-1;
10329 if (ST.hasGFX940Insts())
10330 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
10331 if (NMCOp == (uint16_t)-1)
10332 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
10333 if (NMCOp == (uint16_t)-1)
10334 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
10335 if (NMCOp != (uint16_t)-1)
10336 MCOp = NMCOp;
10337 }
10338
10339 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10340 // no encoding in the given subtarget generation.
10341 if (MCOp == (uint16_t)-1)
10342 return -1;
10343
10344 if (isAsmOnlyOpcode(MCOp))
10345 return -1;
10346
10347 return MCOp;
10348}
10349
10350static
10351TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
10352 assert(RegOpnd.isReg());
10353 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10354 getRegSubRegPair(RegOpnd);
10355}
10356
10357TargetInstrInfo::RegSubRegPair llvm::getRegSequenceSubReg(MachineInstr &MI,
10358 unsigned SubReg) {
10359 assert(MI.isRegSequence());
10360 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10361 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10362 auto &RegOp = MI.getOperand(1 + 2 * I);
10363 return getRegOrUndef(RegOp);
10364 }
10365 return TargetInstrInfo::RegSubRegPair();
10366}
10367
10368// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10369// Following a subreg of reg:subreg isn't supported
10370static bool followSubRegDef(MachineInstr &MI,
10371 TargetInstrInfo::RegSubRegPair &RSR) {
10372 if (!RSR.SubReg)
10373 return false;
10374 switch (MI.getOpcode()) {
10375 default: break;
10376 case AMDGPU::REG_SEQUENCE:
10377 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10378 return true;
10379 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
10380 case AMDGPU::INSERT_SUBREG:
10381 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10382 // inserted the subreg we're looking for
10383 RSR = getRegOrUndef(MI.getOperand(2));
10384 else { // the subreg in the rest of the reg
10385 auto R1 = getRegOrUndef(MI.getOperand(1));
10386 if (R1.SubReg) // subreg of subreg isn't supported
10387 return false;
10388 RSR.Reg = R1.Reg;
10389 }
10390 return true;
10391 }
10392 return false;
10393}
10394
10395MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
10396 const MachineRegisterInfo &MRI) {
10397 assert(MRI.isSSA());
10398 if (!P.Reg.isVirtual())
10399 return nullptr;
10400
10401 auto RSR = P;
10402 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10403 while (auto *MI = DefInst) {
10404 DefInst = nullptr;
10405 switch (MI->getOpcode()) {
10406 case AMDGPU::COPY:
10407 case AMDGPU::V_MOV_B32_e32: {
10408 auto &Op1 = MI->getOperand(1);
10409 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10410 if (Op1.isUndef())
10411 return nullptr;
10412 RSR = getRegSubRegPair(Op1);
10413 DefInst = MRI.getVRegDef(RSR.Reg);
10414 }
10415 break;
10416 }
10417 default:
10418 if (followSubRegDef(*MI, RSR)) {
10419 if (!RSR.Reg)
10420 return nullptr;
10421 DefInst = MRI.getVRegDef(RSR.Reg);
10422 }
10423 }
10424 if (!DefInst)
10425 return MI;
10426 }
10427 return nullptr;
10428}
10429
10430bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
10431 Register VReg,
10432 const MachineInstr &DefMI,
10433 const MachineInstr &UseMI) {
10434 assert(MRI.isSSA() && "Must be run on SSA");
10435
10436 auto *TRI = MRI.getTargetRegisterInfo();
10437 auto *DefBB = DefMI.getParent();
10438
10439 // Don't bother searching between blocks, although it is possible this block
10440 // doesn't modify exec.
10441 if (UseMI.getParent() != DefBB)
10442 return true;
10443
10444 const int MaxInstScan = 20;
10445 int NumInst = 0;
10446
10447 // Stop scan at the use.
10448 auto E = UseMI.getIterator();
10449 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10450 if (I->isDebugInstr())
10451 continue;
10452
10453 if (++NumInst > MaxInstScan)
10454 return true;
10455
10456 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10457 return true;
10458 }
10459
10460 return false;
10461}
10462
10463bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10464 Register VReg,
10465 const MachineInstr &DefMI) {
10466 assert(MRI.isSSA() && "Must be run on SSA");
10467
10468 auto *TRI = MRI.getTargetRegisterInfo();
10469 auto *DefBB = DefMI.getParent();
10470
10471 const int MaxUseScan = 10;
10472 int NumUse = 0;
10473
10474 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10475 auto &UseInst = *Use.getParent();
10476 // Don't bother searching between blocks, although it is possible this block
10477 // doesn't modify exec.
10478 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10479 return true;
10480
10481 if (++NumUse > MaxUseScan)
10482 return true;
10483 }
10484
10485 if (NumUse == 0)
10486 return false;
10487
10488 const int MaxInstScan = 20;
10489 int NumInst = 0;
10490
10491 // Stop scan when we have seen all the uses.
10492 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10493 assert(I != DefBB->end());
10494
10495 if (I->isDebugInstr())
10496 continue;
10497
10498 if (++NumInst > MaxInstScan)
10499 return true;
10500
10501 for (const MachineOperand &Op : I->operands()) {
10502 // We don't check reg masks here as they're used only on calls:
10503 // 1. EXEC is only considered constant within one BB
10504 // 2. A call should be a terminator instruction if present in a BB
10505
10506 if (!Op.isReg())
10507 continue;
10508
10509 Register Reg = Op.getReg();
10510 if (Op.isUse()) {
10511 if (Reg == VReg && --NumUse == 0)
10512 return false;
10513 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10514 return true;
10515 }
10516 }
10517}
10518
10519MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10520 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10521 const DebugLoc &DL, Register Src, Register Dst) const {
10522 auto Cur = MBB.begin();
10523 if (Cur != MBB.end())
10524 do {
10525 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10526 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10527 ++Cur;
10528 } while (Cur != MBB.end() && Cur != LastPHIIt);
10529
10530 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10531 Dst);
10532}
10533
10534MachineInstr *SIInstrInfo::createPHISourceCopy(
10535 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10536 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10537 if (InsPt != MBB.end() &&
10538 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10539 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10540 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10541 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10542 InsPt++;
10543 return BuildMI(MBB, InsPt, DL,
10544 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10545 .addReg(Src, 0, SrcSubReg)
10546 .addReg(AMDGPU::EXEC, RegState::Implicit);
10547 }
10548 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10549 Dst);
10550}
10551
10552bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10553
10554MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
10555 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
10556 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10557 VirtRegMap *VRM) const {
10558 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10559 //
10560 // %0:sreg_32 = COPY $m0
10561 //
10562 // We explicitly chose SReg_32 for the virtual register so such a copy might
10563 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10564 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10565 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10566 // TargetInstrInfo::foldMemoryOperand() is going to try.
10567 // A similar issue also exists with spilling and reloading $exec registers.
10568 //
10569 // To prevent that, constrain the %0 register class here.
10570 if (isFullCopyInstr(MI)) {
10571 Register DstReg = MI.getOperand(0).getReg();
10572 Register SrcReg = MI.getOperand(1).getReg();
10573 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10574 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10575 MachineRegisterInfo &MRI = MF.getRegInfo();
10576 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10577 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10578 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10579 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10580 return nullptr;
10581 }
10582 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10583 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10584 return nullptr;
10585 }
10586 }
10587 }
10588
10589 return nullptr;
10590}
10591
10592unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10593 const MachineInstr &MI,
10594 unsigned *PredCost) const {
10595 if (MI.isBundle()) {
10596 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
10597 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10598 unsigned Lat = 0, Count = 0;
10599 for (++I; I != E && I->isBundledWithPred(); ++I) {
10600 ++Count;
10601 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10602 }
10603 return Lat + Count - 1;
10604 }
10605
10606 return SchedModel.computeInstrLatency(&MI);
10607}
10608
10609const MachineOperand &
10610SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const {
10611 if (const MachineOperand *CallAddrOp =
10612 getNamedOperand(MI, AMDGPU::OpName::src0))
10613 return *CallAddrOp;
10615}
10616
10619 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10620 unsigned Opcode = MI.getOpcode();
10621
10622 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10623 Register Dst = MI.getOperand(0).getReg();
10624 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10625 : MI.getOperand(1).getReg();
10626 LLT DstTy = MRI.getType(Dst);
10627 LLT SrcTy = MRI.getType(Src);
10628 unsigned DstAS = DstTy.getAddressSpace();
10629 unsigned SrcAS = SrcTy.getAddressSpace();
10630 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10631 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10632 ST.hasGloballyAddressableScratch()
10633 ? InstructionUniformity::NeverUniform
10634 : InstructionUniformity::Default;
10635 };
10636
10637 // If the target supports globally addressable scratch, the mapping from
10638 // scratch memory to the flat aperture changes, and therefore an address space
10639 // cast is no longer uniform.
10640 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10641 return HandleAddrSpaceCast(MI);
10642
10643 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10644 auto IID = GI->getIntrinsicID();
10645 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10646 return InstructionUniformity::NeverUniform;
10647 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10648 return InstructionUniformity::AlwaysUniform;
10649
10650 switch (IID) {
10651 case Intrinsic::amdgcn_addrspacecast_nonnull:
10652 return HandleAddrSpaceCast(MI);
10653 case Intrinsic::amdgcn_if:
10654 case Intrinsic::amdgcn_else:
10655 // FIXME: Uniform if second result
10656 break;
10657 }
10658
10659 return InstructionUniformity::Default;
10660 }
10661
10662 // Loads from the private and flat address spaces are divergent, because
10663 // threads can execute the load instruction with the same inputs and get
10664 // different results.
10665 //
10666 // All other loads are not divergent, because if threads issue loads with the
10667 // same arguments, they will always get the same result.
10668 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10669 Opcode == AMDGPU::G_SEXTLOAD) {
10670 if (MI.memoperands_empty())
10671 return InstructionUniformity::NeverUniform; // conservative assumption
10672
10673 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10674 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10675 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10676 })) {
10677 // At least one MMO in a non-global address space.
10678 return InstructionUniformity::NeverUniform;
10679 }
10680 return InstructionUniformity::Default;
10681 }
10682
10683 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10684 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10685 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10686 AMDGPU::isGenericAtomic(Opcode)) {
10687 return InstructionUniformity::NeverUniform;
10688 }
10689 return InstructionUniformity::Default;
10690}
10691
10692InstructionUniformity
10693SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10694
10695 if (isNeverUniform(MI))
10696 return InstructionUniformity::NeverUniform;
10697
10698 unsigned opcode = MI.getOpcode();
10699 if (opcode == AMDGPU::V_READLANE_B32 ||
10700 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10701 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10702 return InstructionUniformity::NeverUniform;
10703
10704 if (isCopyInstr(MI)) {
10705 const MachineOperand &srcOp = MI.getOperand(1);
10706 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10707 const TargetRegisterClass *regClass =
10708 RI.getPhysRegBaseClass(srcOp.getReg());
10709 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10710 : InstructionUniformity::NeverUniform;
10711 }
10712 return InstructionUniformity::Default;
10713 }
10714
10715 // GMIR handling
10716 if (MI.isPreISelOpcode())
10718
10719 // Atomics are divergent because they are executed sequentially: when an
10720 // atomic operation refers to the same address in each thread, then each
10721 // thread after the first sees the value written by the previous thread as the
10722 // original value.
10723
10724 if (isAtomic(MI))
10725 return InstructionUniformity::NeverUniform;
10726
10727 // Loads from the private and flat address spaces are divergent, because
10728 // threads can execute the load instruction with the same inputs and get
10729 // different results.
10730 if (isFLAT(MI) && MI.mayLoad()) {
10731 if (MI.memoperands_empty())
10732 return InstructionUniformity::NeverUniform; // conservative assumption
10733
10734 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10735 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10736 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10737 })) {
10738 // At least one MMO in a non-global address space.
10739 return InstructionUniformity::NeverUniform;
10740 }
10741
10742 return InstructionUniformity::Default;
10743 }
10744
10745 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10746 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10747
10748 // FIXME: It's conceptually broken to report this for an instruction, and not
10749 // a specific def operand. For inline asm in particular, there could be mixed
10750 // uniform and divergent results.
10751 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10752 const MachineOperand &SrcOp = MI.getOperand(I);
10753 if (!SrcOp.isReg())
10754 continue;
10755
10756 Register Reg = SrcOp.getReg();
10757 if (!Reg || !SrcOp.readsReg())
10758 continue;
10759
10760 // If RegBank is null, this is unassigned or an unallocatable special
10761 // register, which are all scalars.
10762 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10763 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10764 return InstructionUniformity::NeverUniform;
10765 }
10766
10767 // TODO: The uniformity check conditions above can be rearranged for more
10768 // readability.
10769
10770 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10771 // currently turned into no-op COPYs by SelectionDAG ISel and are
10772 // therefore no longer recognizable.
10773
10774 return InstructionUniformity::AlwaysUniform;
10775}
10776
10777unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10778 switch (MF.getFunction().getCallingConv()) {
10779 case CallingConv::AMDGPU_PS:
10780 return 1;
10781 case CallingConv::AMDGPU_VS:
10782 return 2;
10783 case CallingConv::AMDGPU_GS:
10784 return 3;
10785 case CallingConv::AMDGPU_HS:
10786 case CallingConv::AMDGPU_LS:
10787 case CallingConv::AMDGPU_ES: {
10788 const Function &F = MF.getFunction();
10789 F.getContext().diagnose(DiagnosticInfoUnsupported(
10790 F, "ds_ordered_count unsupported for this calling conv"));
10791 [[fallthrough]];
10792 }
10793 case CallingConv::AMDGPU_CS:
10794 case CallingConv::AMDGPU_KERNEL:
10795 case CallingConv::C:
10796 case CallingConv::Fast:
10797 default:
10798 // Assume other calling conventions are various compute callable functions
10799 return 0;
10800 }
10801}
10802
10803bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10804 Register &SrcReg2, int64_t &CmpMask,
10805 int64_t &CmpValue) const {
10806 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10807 return false;
10808
10809 switch (MI.getOpcode()) {
10810 default:
10811 break;
10812 case AMDGPU::S_CMP_EQ_U32:
10813 case AMDGPU::S_CMP_EQ_I32:
10814 case AMDGPU::S_CMP_LG_U32:
10815 case AMDGPU::S_CMP_LG_I32:
10816 case AMDGPU::S_CMP_LT_U32:
10817 case AMDGPU::S_CMP_LT_I32:
10818 case AMDGPU::S_CMP_GT_U32:
10819 case AMDGPU::S_CMP_GT_I32:
10820 case AMDGPU::S_CMP_LE_U32:
10821 case AMDGPU::S_CMP_LE_I32:
10822 case AMDGPU::S_CMP_GE_U32:
10823 case AMDGPU::S_CMP_GE_I32:
10824 case AMDGPU::S_CMP_EQ_U64:
10825 case AMDGPU::S_CMP_LG_U64:
10826 SrcReg = MI.getOperand(0).getReg();
10827 if (MI.getOperand(1).isReg()) {
10828 if (MI.getOperand(1).getSubReg())
10829 return false;
10830 SrcReg2 = MI.getOperand(1).getReg();
10831 CmpValue = 0;
10832 } else if (MI.getOperand(1).isImm()) {
10833 SrcReg2 = Register();
10834 CmpValue = MI.getOperand(1).getImm();
10835 } else {
10836 return false;
10837 }
10838 CmpMask = ~0;
10839 return true;
10840 case AMDGPU::S_CMPK_EQ_U32:
10841 case AMDGPU::S_CMPK_EQ_I32:
10842 case AMDGPU::S_CMPK_LG_U32:
10843 case AMDGPU::S_CMPK_LG_I32:
10844 case AMDGPU::S_CMPK_LT_U32:
10845 case AMDGPU::S_CMPK_LT_I32:
10846 case AMDGPU::S_CMPK_GT_U32:
10847 case AMDGPU::S_CMPK_GT_I32:
10848 case AMDGPU::S_CMPK_LE_U32:
10849 case AMDGPU::S_CMPK_LE_I32:
10850 case AMDGPU::S_CMPK_GE_U32:
10851 case AMDGPU::S_CMPK_GE_I32:
10852 SrcReg = MI.getOperand(0).getReg();
10853 SrcReg2 = Register();
10854 CmpValue = MI.getOperand(1).getImm();
10855 CmpMask = ~0;
10856 return true;
10857 }
10858
10859 return false;
10860}
10861
10862static bool isSCCDeadOnExit(MachineBasicBlock *MBB) {
10863 for (MachineBasicBlock *S : MBB->successors()) {
10864 if (S->isLiveIn(AMDGPU::SCC))
10865 return false;
10866 }
10867 return true;
10868}
10869
10870// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
10871// (incoming SCC) = !(SCC defined by SCCDef).
10872// Return true if all uses can be re-written, false otherwise.
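//
// For example (illustrative): S_CSELECT_B32 %d, %a, %b is rewritten by
// swapping its operands to S_CSELECT_B32 %d, %b, %a, and S_CBRANCH_SCC0 %bb
// becomes S_CBRANCH_SCC1 %bb (and vice versa); any other reader of SCC blocks
// the inversion.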
10873bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
10874 MachineBasicBlock *MBB = SCCDef->getParent();
10875 SmallVector<MachineInstr *> InvertInstr;
10876 bool SCCIsDead = false;
10877
10878 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
10879 constexpr unsigned ScanLimit = 12;
10880 unsigned Count = 0;
10881 for (MachineInstr &MI :
10882 make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
10883 if (++Count > ScanLimit)
10884 return false;
10885 if (MI.readsRegister(AMDGPU::SCC, &RI)) {
10886 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
10887 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
10888 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10889 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
10890 InvertInstr.push_back(&MI);
10891 else
10892 return false;
10893 }
10894 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
10895 SCCIsDead = true;
10896 break;
10897 }
10898 }
10899 if (!SCCIsDead && isSCCDeadOnExit(MBB))
10900 SCCIsDead = true;
10901
10902 // SCC may have more uses. Can't invert all of them.
10903 if (!SCCIsDead)
10904 return false;
10905
10906 // Invert uses
10907 for (MachineInstr *MI : InvertInstr) {
10908 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
10909 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
10910 swapOperands(*MI);
10911 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10912 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
10913 MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
10914 ? AMDGPU::S_CBRANCH_SCC1
10915 : AMDGPU::S_CBRANCH_SCC0));
10916 } else {
10917 llvm_unreachable("SCC used but no inversion handling");
10918 }
10919 }
10920 return true;
10921}
10922
10923// SCC is already valid after SCCValid.
10924// SCCRedefine will redefine SCC to the same value already available after
10925 // SCCValid. If there are no intervening SCC conflicts, delete SCCRedefine and
10926// update kill/dead flags if necessary.
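//
// For example (illustrative): if SCCValid is an S_AND_B32 that already set SCC
// from its result and SCCRedefine is a later S_CMP_LG_U32 of that result
// against 0, the compare recomputes the same SCC value and can be erased,
// provided nothing in between clobbers SCC.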
10927bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
10928 bool NeedInversion) const {
10929 MachineInstr *KillsSCC = nullptr;
10930 if (SCCValid->getParent() != SCCRedefine->getParent())
10931 return false;
10932 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
10933 SCCRedefine->getIterator())) {
10934 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
10935 return false;
10936 if (MI.killsRegister(AMDGPU::SCC, &RI))
10937 KillsSCC = &MI;
10938 }
10939 if (NeedInversion && !invertSCCUse(SCCRedefine))
10940 return false;
10941 if (MachineOperand *SccDef =
10942 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
10943 SccDef->setIsDead(false);
10944 if (KillsSCC)
10945 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
10946 SCCRedefine->eraseFromParent();
10947 return true;
10948}
10949
10950static bool foldableSelect(const MachineInstr &Def) {
10951 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
10952 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
10953 return false;
10954 bool Op1IsNonZeroImm =
10955 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
10956 bool Op2IsZeroImm =
10957 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
10958 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
10959 return false;
10960 return true;
10961}
10962
10963bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
10964 Register SrcReg2, int64_t CmpMask,
10965 int64_t CmpValue,
10966 const MachineRegisterInfo *MRI) const {
10967 if (!SrcReg || SrcReg.isPhysical())
10968 return false;
10969
10970 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10971 return false;
10972
10973 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
10974 this](bool NeedInversion) -> bool {
10975 if (CmpValue != 0)
10976 return false;
10977
10978 MachineInstr *Def = MRI->getVRegDef(SrcReg);
10979 if (!Def)
10980 return false;
10981
10982 // For S_OP that set SCC = DST!=0, do the transformation
10983 //
10984 // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
10985
10986 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
10987 // for S_CSELECT* already has the same value that will be calculated by
10988 // s_cmp_lg_*
10989 //
10990 // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
10991 // imm), 0)
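    //
    // For example (illustrative):
    //   %x:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc
    //   S_CMP_LG_U32 %x, 0, implicit-def $scc
    // The compare just recomputes the SCC value that already selected between
    // 1 and 0, so it can be erased.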
10992 if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def))
10993 return false;
10994
10995 if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
10996 return false;
10997
10998 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
10999 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
11000 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
11001 // sX = s_cselect_b64 (non-zero imm), 0
11002 // sLo = copy sX.sub0
11003 // sHi = copy sX.sub1
11004 // sY = s_or_b32 sLo, sHi
11005 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11006 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
11007 const MachineOperand &OrOpnd1 = Def->getOperand(1);
11008 const MachineOperand &OrOpnd2 = Def->getOperand(2);
11009 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11010 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
11011 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
11012 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11013 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
11014 Def2->getOperand(1).isReg() &&
11015 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
11016 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
11017 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
11018 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
11019 if (Select && foldableSelect(*Select))
11020 optimizeSCC(Select, Def, false);
11021 }
11022 }
11023 }
11024 return true;
11025 };
11026
11027 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11028 this](int64_t ExpectedValue, unsigned SrcSize,
11029 bool IsReversible, bool IsSigned) -> bool {
11030 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11031 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11032 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11033 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11034 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11035 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11036 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11037 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11038 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11039 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11040 //
11041 // Signed ge/gt are not used for the sign bit.
11042 //
11043 // If result of the AND is unused except in the compare:
11044 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11045 //
11046 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11047 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11048 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11049 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11050 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11051 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
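    //
    // For example (illustrative), with n == 2:
    //   s_cmp_eq_u32 (s_and_b32 $src, 4), 4  =>  s_and_b32 $src, 4
    // and, if the result of the AND has no other use:
    //   =>  s_bitcmp1_b32 $src, 2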
11052
11053 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11054 if (!Def)
11055 return false;
11056
11057 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11058 Def->getOpcode() != AMDGPU::S_AND_B64)
11059 return false;
11060
11061 int64_t Mask;
11062 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11063 if (MO->isImm())
11064 Mask = MO->getImm();
11065 else if (!getFoldableImm(MO, Mask))
11066 return false;
11067 Mask &= maxUIntN(SrcSize);
11068 return isPowerOf2_64(Mask);
11069 };
11070
11071 MachineOperand *SrcOp = &Def->getOperand(1);
11072 if (isMask(SrcOp))
11073 SrcOp = &Def->getOperand(2);
11074 else if (isMask(&Def->getOperand(2)))
11075 SrcOp = &Def->getOperand(1);
11076 else
11077 return false;
11078
11079 // A valid Mask is required to have a single bit set, hence a non-zero and
11080 // power-of-two value. This verifies that we will not do a 64-bit shift below.
11081 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11082 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
11083 if (IsSigned && BitNo == SrcSize - 1)
11084 return false;
11085
11086 ExpectedValue <<= BitNo;
11087
11088 bool IsReversedCC = false;
11089 if (CmpValue != ExpectedValue) {
11090 if (!IsReversible)
11091 return false;
11092 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11093 if (!IsReversedCC)
11094 return false;
11095 }
11096
11097 Register DefReg = Def->getOperand(0).getReg();
11098 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
11099 return false;
11100
11101 if (!optimizeSCC(Def, &CmpInstr, false))
11102 return false;
11103
11104 if (!MRI->use_nodbg_empty(DefReg)) {
11105 assert(!IsReversedCC);
11106 return true;
11107 }
11108
11109 // Replace AND with unused result with a S_BITCMP.
11110 MachineBasicBlock *MBB = Def->getParent();
11111
11112 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11113 : AMDGPU::S_BITCMP1_B32
11114 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11115 : AMDGPU::S_BITCMP1_B64;
11116
11117 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
11118 .add(*SrcOp)
11119 .addImm(BitNo);
11120 Def->eraseFromParent();
11121
11122 return true;
11123 };
11124
11125 switch (CmpInstr.getOpcode()) {
11126 default:
11127 break;
11128 case AMDGPU::S_CMP_EQ_U32:
11129 case AMDGPU::S_CMP_EQ_I32:
11130 case AMDGPU::S_CMPK_EQ_U32:
11131 case AMDGPU::S_CMPK_EQ_I32:
11132 return optimizeCmpAnd(1, 32, true, false) ||
11133 optimizeCmpSelect(/*NeedInversion=*/true);
11134 case AMDGPU::S_CMP_GE_U32:
11135 case AMDGPU::S_CMPK_GE_U32:
11136 return optimizeCmpAnd(1, 32, false, false);
11137 case AMDGPU::S_CMP_GE_I32:
11138 case AMDGPU::S_CMPK_GE_I32:
11139 return optimizeCmpAnd(1, 32, false, true);
11140 case AMDGPU::S_CMP_EQ_U64:
11141 return optimizeCmpAnd(1, 64, true, false);
11142 case AMDGPU::S_CMP_LG_U32:
11143 case AMDGPU::S_CMP_LG_I32:
11144 case AMDGPU::S_CMPK_LG_U32:
11145 case AMDGPU::S_CMPK_LG_I32:
11146 return optimizeCmpAnd(0, 32, true, false) ||
11147 optimizeCmpSelect(/*NeedInversion=*/false);
11148 case AMDGPU::S_CMP_GT_U32:
11149 case AMDGPU::S_CMPK_GT_U32:
11150 return optimizeCmpAnd(0, 32, false, false);
11151 case AMDGPU::S_CMP_GT_I32:
11152 case AMDGPU::S_CMPK_GT_I32:
11153 return optimizeCmpAnd(0, 32, false, true);
11154 case AMDGPU::S_CMP_LG_U64:
11155 return optimizeCmpAnd(0, 64, true, false) ||
11156 optimizeCmpSelect(/*NeedInversion=*/false);
11157 }
11158
11159 return false;
11160}
11161
11162void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
11163 AMDGPU::OpName OpName) const {
11164 if (!ST.needsAlignedVGPRs())
11165 return;
11166
11167 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11168 if (OpNo < 0)
11169 return;
11170 MachineOperand &Op = MI.getOperand(OpNo);
11171 if (getOpSize(MI, OpNo) > 4)
11172 return;
11173
11174 // Add implicit aligned super-reg to force alignment on the data operand.
11175 const DebugLoc &DL = MI.getDebugLoc();
11176 MachineBasicBlock *BB = MI.getParent();
11177 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11178 Register DataReg = Op.getReg();
11179 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11180 Register Undef = MRI.createVirtualRegister(
11181 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11182 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11183 Register NewVR =
11184 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11185 : &AMDGPU::VReg_64_Align2RegClass);
11186 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11187 .addReg(DataReg, 0, Op.getSubReg())
11188 .addImm(AMDGPU::sub0)
11189 .addReg(Undef)
11190 .addImm(AMDGPU::sub1);
11191 Op.setReg(NewVR);
11192 Op.setSubReg(AMDGPU::sub0);
11193 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11194}
11195
11196bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
11197 if (isIGLP(*MI))
11198 return false;
11199
11200 return TargetInstrInfo::isGlobalMemoryObject(MI);
11201}
11202
11203bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
11204 if (!isWMMA(MI) && !isSWMMAC(MI))
11205 return false;
11206
11207 if (AMDGPU::isGFX1250(ST))
11208 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11209
11210 return true;
11211}
11212
11213bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
11214 unsigned Opcode = MI.getOpcode();
11215
11216 if (AMDGPU::isGFX12Plus(ST))
11217 return isDOT(MI) || isXDLWMMA(MI);
11218
11219 if (!isMAI(MI) || isDGEMM(Opcode) ||
11220 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11221 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11222 return false;
11223
11224 if (!ST.hasGFX940Insts())
11225 return true;
11226
11227 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11228}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool isSCCDeadOnExit(MachineBasicBlock *MBB)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:144
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1571
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasAddNoCarry() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
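A minimal sketch of how MachineOperands are typically created and mutated (illustrative only; foldRegToImm and makeKilledUse are hypothetical helpers):
#include "llvm/CodeGen/MachineOperand.h"
#include <cassert>
using namespace llvm;
// Rewrite a register use into an immediate in place, e.g. after a constant
// has been proven for it.
static void foldRegToImm(MachineOperand &MO, int64_t Imm) {
  assert(MO.isReg() && "expected a register operand");
  MO.ChangeToImmediate(Imm);
}
// Build a standalone register use marked as the last use of Reg.
static MachineOperand makeKilledUse(Register Reg) {
  return MachineOperand::CreateReg(Reg, /*isDef=*/false, /*isImp=*/false,
                                   /*isKill=*/true);
}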
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to an MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
bool canAddToBBProlog(const MachineInstr &MI) const
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool mayAccessScratch(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const final
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIdx operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const override
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
static bool usesLGKM_CNT(const MachineInstr &MI)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst when lowering 16-bit SALU instructions to VALU.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
const MachineOperand & getCalleeOperand(const MachineInstr &MI) const override
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool setsSCCifResultIsNonZero(const MachineInstr &MI)
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
static bool isWWMRegSpillOpcode(uint16_t Opcode)
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
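A minimal usage sketch for the named-operand helper (not code from this file; getMemOffset is a hypothetical wrapper, and not every opcode carries an offset operand, hence the null check):
#include "SIInstrInfo.h"
#include <optional>
using namespace llvm;
// Read the named 'offset' immediate of MI, if the opcode has one.
static std::optional<int64_t> getMemOffset(const SIInstrInfo &TII,
                                           MachineInstr &MI) {
  if (const MachineOperand *Off =
          TII.getNamedOperand(MI, AMDGPU::OpName::offset))
    if (Off->isImm())
      return Off->getImm();
  return std::nullopt;
}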
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were operand OpIdx of MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual const MachineOperand & getCalleeOperand(const MachineInstr &MI) const
Returns the callee operand from the given MI.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating-point types.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:232
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:223
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:225
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:211
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:207
@ OPERAND_REG_IMM_V2FP16_SPLAT
Definition SIDefines.h:210
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:217
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:212
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:226
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:237
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:238
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:213
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:249
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:204
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:218
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:224
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:243
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:214
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:239
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:221
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:229
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:587
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:589
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:586
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:588
@ TI_CONSTDATA_START
Definition AMDGPU.h:585
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
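As a brief illustration of the range-based wrappers (a sketch only; allUsesAreRegs is a made-up helper):
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;
// True if every use operand of MI is a register (no immediates, frame
// indexes, etc.).
static bool allUsesAreRegs(const MachineInstr &MI) {
  return all_of(MI.uses(),
                [](const MachineOperand &MO) { return MO.isReg(); });
}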
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:207
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
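The typical BuildMI pattern, shown as a minimal sketch (not code from this file; emitMovImm and the opcode choice are illustrative only):
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
// Emit "DstReg = S_MOV_B32 Imm" before iterator I and return the new
// instruction.
static MachineInstr *emitMovImm(const SIInstrInfo &TII, MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I,
                                const DebugLoc &DL, Register DstReg,
                                int64_t Imm) {
  return BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), DstReg)
      .addImm(Imm)
      .getInstr();
}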
constexpr unsigned getKillRegState(bool B)
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
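A small sketch of the usual immediate-range test (helper names fitsSImm16/fitsSImm are made up):
#include "llvm/Support/MathExtras.h"
using namespace llvm;
// Does a branch displacement fit a signed 16-bit immediate field?
static bool fitsSImm16(int64_t Offset) { return isInt<16>(Offset); }
// Same test with a width only known at run time.
static bool fitsSImm(unsigned Bits, int64_t Offset) {
  return isIntN(Bits, Offset);
}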
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
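A minimal sketch of enumerate over an operand list (findFirstImmOperand is a hypothetical helper):
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;
// Index of the first immediate operand of MI, or -1 if there is none.
static int findFirstImmOperand(const MachineInstr &MI) {
  for (const auto &En : enumerate(MI.operands()))
    if (En.value().isImm())
      return static_cast<int>(En.index());
  return -1;
}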
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
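A sketch of the erase-while-iterating pattern this helper enables (eraseDeadCopies is a made-up example):
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
using namespace llvm;
// Erase dead copies from a block; the early-inc range keeps iteration valid
// while the current instruction is removed.
static void eraseDeadCopies(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : make_early_inc_range(MBB))
    if (MI.isCopy() && MI.getOperand(0).isDead())
      MI.eraseFromParent();
}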
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:546
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:202
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
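A one-line sketch of splitting a 64-bit immediate into halves, the usual first step when a 64-bit constant has to be materialized with 32-bit moves (splitImm64 is a hypothetical helper):
#include "llvm/Support/MathExtras.h"
#include <cstdint>
#include <utility>
using namespace llvm;
// Return {low 32 bits, high 32 bits} of a 64-bit immediate.
static std::pair<uint32_t, uint32_t> splitImm64(uint64_t Imm) {
  return {Lo_32(Imm), Hi_32(Imm)};
}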
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
constexpr unsigned getUndefRegState(bool B)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
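A small sketch combining maskTrailingOnes with SignExtend64, a common pattern when decoding immediate fields (decodeSImmField is a made-up helper):
#include "llvm/Support/MathExtras.h"
using namespace llvm;
// Extract the low 'Width' bits of an encoded word and sign-extend them.
static int64_t decodeSImmField(uint64_t Encoded, unsigned Width) {
  uint64_t Field = Encoded & maskTrailingOnes<uint64_t>(Width);
  return SignExtend64(Field, Width);
}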
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands read the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.