1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
52// Must be at least 4 to be able to branch over minimum unconditional branch
53// code. This is only for making it possible to write reasonably small tests for
54// long branches.
static cl::opt<unsigned>
    BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                     cl::desc("Restrict range of branch instructions (DEBUG)"));

static cl::opt<bool> Fix16BitCopies(
    "amdgpu-fix-16-bit-physreg-copies",
    cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
    cl::init(true), cl::ReallyHidden);

SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
    : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
                         AMDGPU::ADJCALLSTACKDOWN),
      RI(ST), ST(ST) {
  SchedModel.init(&ST);
}
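// Illustrative note (not in the original source): the branch-range override is
// mainly exercised by tests, e.g. an invocation along the lines of
//   llc -mtriple=amdgcn -amdgpu-s-branch-bits=4 < test.ll
// forces conditional branches out of range with only a few padding
// instructions, so the long-branch expansion can be tested with small inputs.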
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
83/// Returns true if both nodes have the same value for the given
84/// operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
                                      AMDGPU::OpName OpName) {
87 unsigned Opc0 = N0->getMachineOpcode();
88 unsigned Opc1 = N1->getMachineOpcode();
89
90 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
91 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
92
93 if (Op0Idx == -1 && Op1Idx == -1)
94 return true;
95
96
97 if ((Op0Idx == -1 && Op1Idx != -1) ||
98 (Op1Idx == -1 && Op0Idx != -1))
99 return false;
100
101 // getNamedOperandIdx returns the index for the MachineInstr's operands,
102 // which includes the result as the first operand. We are indexing into the
103 // MachineSDNode's operands, so we need to skip the result operand to get
104 // the real index.
105 --Op0Idx;
106 --Op1Idx;
107
108 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
109}
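// Illustrative example (not in the original source): for a DS load such as
// DS_READ_B32, whose MachineInstr operand list begins (vdst, addr, offset, ...),
// getNamedOperandIdx returns 2 for 'offset'. The matching MachineSDNode has no
// result in its operand list, so the SDNode operand index is 1 -- hence the
// decrement above.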
110
static bool canRemat(const MachineInstr &MI) {

  if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
      SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
      SIInstrInfo::isSALU(MI))
    return true;

118 if (SIInstrInfo::isSMRD(MI)) {
119 return !MI.memoperands_empty() &&
120 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
121 return MMO->isLoad() && MMO->isInvariant();
122 });
123 }
124
125 return false;
126}
127
bool SIInstrInfo::isReallyTriviallyReMaterializable(
    const MachineInstr &MI) const {
130
131 if (canRemat(MI)) {
132 // Normally VALU use of exec would block the rematerialization, but that
133 // is OK in this case to have an implicit exec read as all VALU do.
134 // We really want all of the generic logic for this except for this.
135
136 // Another potential implicit use is mode register. The core logic of
137 // the RA will not attempt rematerialization if mode is set anywhere
138 // in the function, otherwise it is safe since mode is not changed.
139
140 // There is difference to generic method which does not allow
141 // rematerialization if there are virtual register uses. We allow this,
142 // therefore this method includes SOP instructions as well.
143 if (!MI.hasImplicitDef() &&
144 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
145 !MI.mayRaiseFPException())
146 return true;
147 }

  return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
}
151
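// Illustrative example (not in the original source): a VALU move such as
//   %v:vgpr_32 = V_MOV_B32_e32 42, implicit $exec
// carries only the implicit $exec use from its instruction description and
// cannot raise FP exceptions, so it is accepted above; an instruction with a
// virtual register use can also qualify here, unlike in the generic
// TargetInstrInfo implementation.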
152// Returns true if the scalar result of a VALU instruction depends on exec.
153bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
154 // Ignore comparisons which are only used masked with exec.
155 // This allows some hoisting/sinking of VALU comparisons.
156 if (MI.isCompare()) {
157 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
158 if (!Dst)
159 return true;
160
161 Register DstReg = Dst->getReg();
162 if (!DstReg.isVirtual())
163 return true;
164
165 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
166 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
167 switch (Use.getOpcode()) {
168 case AMDGPU::S_AND_SAVEEXEC_B32:
169 case AMDGPU::S_AND_SAVEEXEC_B64:
170 break;
171 case AMDGPU::S_AND_B32:
172 case AMDGPU::S_AND_B64:
173 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
174 return true;
175 break;
176 default:
177 return true;
178 }
179 }
180 return false;
181 }
182
183 switch (MI.getOpcode()) {
184 default:
185 break;
186 case AMDGPU::V_READFIRSTLANE_B32:
187 return true;
188 }
189
190 return false;
191}
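// Illustrative example (not in the original source): a compare such as
//   %sdst:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 %a, %b, implicit $exec
// whose only users are S_AND_B32 with $exec (or S_AND_SAVEEXEC_B32) is
// reported as not depending on exec, so it can be hoisted or sunk; any other
// user, or a V_READFIRSTLANE_B32, keeps the exec dependency.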
192
bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
  // Any implicit use of exec by VALU is not a real register read.
195 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
196 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
197}
198
bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
                               MachineBasicBlock *SuccToSinkTo,
                               MachineCycleInfo *CI) const {
202 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
203 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
204 return true;
205
206 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
207 // Check if sinking of MI would create temporal divergent use.
208 for (auto Op : MI.uses()) {
209 if (Op.isReg() && Op.getReg().isVirtual() &&
210 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
211 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
212
213 // SgprDef defined inside cycle
214 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
215 if (FromCycle == nullptr)
216 continue;
217
218 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
219 // Check if there is a FromCycle that contains SgprDef's basic block but
220 // does not contain SuccToSinkTo and also has divergent exit condition.
221 while (FromCycle && !FromCycle->contains(ToCycle)) {
        SmallVector<MachineBasicBlock *> ExitingBlocks;
        FromCycle->getExitingBlocks(ExitingBlocks);
224
225 // FromCycle has divergent exit condition.
226 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
227 if (hasDivergentBranch(ExitingBlock))
228 return false;
229 }
230
231 FromCycle = FromCycle->getParentCycle();
232 }
233 }
234 }
235
236 return true;
237}
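// Illustrative example (not in the original source): if MI uses an SGPR that
// is defined inside a loop whose exit branch is divergent, sinking MI to a
// block outside that loop would make it observe only the value produced on the
// final iteration executed by each wave, which is not what every lane saw
// (a temporal-divergent use), so the walk above rejects the sink.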
238
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
242 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
243 return false;
244
245 unsigned Opc0 = Load0->getMachineOpcode();
246 unsigned Opc1 = Load1->getMachineOpcode();
247
248 // Make sure both are actually loads.
249 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
250 return false;
251
252 // A mayLoad instruction without a def is not a load. Likely a prefetch.
253 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
254 return false;
255
256 if (isDS(Opc0) && isDS(Opc1)) {
257
258 // FIXME: Handle this case:
259 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
260 return false;
261
262 // Check base reg.
263 if (Load0->getOperand(0) != Load1->getOperand(0))
264 return false;
265
266 // Skip read2 / write2 variants for simplicity.
267 // TODO: We should report true if the used offsets are adjacent (excluded
268 // st64 versions).
269 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
270 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
271 if (Offset0Idx == -1 || Offset1Idx == -1)
272 return false;
273
274 // XXX - be careful of dataless loads
275 // getNamedOperandIdx returns the index for MachineInstrs. Since they
276 // include the output in the operand list, but SDNodes don't, we need to
277 // subtract the index by one.
278 Offset0Idx -= get(Opc0).NumDefs;
279 Offset1Idx -= get(Opc1).NumDefs;
280 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
281 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
282 return true;
283 }
284
285 if (isSMRD(Opc0) && isSMRD(Opc1)) {
286 // Skip time and cache invalidation instructions.
287 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
288 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
289 return false;
290
291 unsigned NumOps = getNumOperandsNoGlue(Load0);
292 if (NumOps != getNumOperandsNoGlue(Load1))
293 return false;
294
295 // Check base reg.
296 if (Load0->getOperand(0) != Load1->getOperand(0))
297 return false;
298
299 // Match register offsets, if both register and immediate offsets present.
300 assert(NumOps == 4 || NumOps == 5);
301 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
302 return false;
303
    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));

309 if (!Load0Offset || !Load1Offset)
310 return false;
311
312 Offset0 = Load0Offset->getZExtValue();
313 Offset1 = Load1Offset->getZExtValue();
314 return true;
315 }
316
317 // MUBUF and MTBUF can access the same addresses.
318 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
319
320 // MUBUF and MTBUF have vaddr at different indices.
321 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
322 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
323 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
324 return false;
325
326 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
327 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
328
329 if (OffIdx0 == -1 || OffIdx1 == -1)
330 return false;
331
332 // getNamedOperandIdx returns the index for MachineInstrs. Since they
333 // include the output in the operand list, but SDNodes don't, we need to
334 // subtract the index by one.
335 OffIdx0 -= get(Opc0).NumDefs;
336 OffIdx1 -= get(Opc1).NumDefs;
337
338 SDValue Off0 = Load0->getOperand(OffIdx0);
339 SDValue Off1 = Load1->getOperand(OffIdx1);
340
341 // The offset might be a FrameIndexSDNode.
342 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
343 return false;
344
345 Offset0 = Off0->getAsZExtVal();
346 Offset1 = Off1->getAsZExtVal();
347 return true;
348 }
349
350 return false;
351}
352
353static bool isStride64(unsigned Opc) {
354 switch (Opc) {
355 case AMDGPU::DS_READ2ST64_B32:
356 case AMDGPU::DS_READ2ST64_B64:
357 case AMDGPU::DS_WRITE2ST64_B32:
358 case AMDGPU::DS_WRITE2ST64_B64:
359 return true;
360 default:
361 return false;
362 }
363}
364
bool SIInstrInfo::getMemOperandsWithOffsetWidth(
    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
    int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
    const TargetRegisterInfo *TRI) const {
369 if (!LdSt.mayLoadOrStore())
370 return false;
371
372 unsigned Opc = LdSt.getOpcode();
373 OffsetIsScalable = false;
374 const MachineOperand *BaseOp, *OffsetOp;
375 int DataOpIdx;
376
377 if (isDS(LdSt)) {
378 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
379 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
380 if (OffsetOp) {
381 // Normal, single offset LDS instruction.
382 if (!BaseOp) {
383 // DS_CONSUME/DS_APPEND use M0 for the base address.
384 // TODO: find the implicit use operand for M0 and use that as BaseOp?
385 return false;
386 }
387 BaseOps.push_back(BaseOp);
388 Offset = OffsetOp->getImm();
389 // Get appropriate operand, and compute width accordingly.
390 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
391 if (DataOpIdx == -1)
392 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
393 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
394 Width = LocationSize::precise(64);
395 else
396 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
397 } else {
398 // The 2 offset instructions use offset0 and offset1 instead. We can treat
399 // these as a load with a single offset if the 2 offsets are consecutive.
400 // We will use this for some partially aligned loads.
401 const MachineOperand *Offset0Op =
402 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
403 const MachineOperand *Offset1Op =
404 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
405
406 unsigned Offset0 = Offset0Op->getImm() & 0xff;
407 unsigned Offset1 = Offset1Op->getImm() & 0xff;
408 if (Offset0 + 1 != Offset1)
409 return false;
410
411 // Each of these offsets is in element sized units, so we need to convert
412 // to bytes of the individual reads.
413
414 unsigned EltSize;
415 if (LdSt.mayLoad())
416 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
417 else {
418 assert(LdSt.mayStore());
419 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
421 }
422
423 if (isStride64(Opc))
424 EltSize *= 64;
425
426 BaseOps.push_back(BaseOp);
427 Offset = EltSize * Offset0;
428 // Get appropriate operand(s), and compute width accordingly.
429 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
430 if (DataOpIdx == -1) {
431 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
432 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
433 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
434 Width = LocationSize::precise(
435 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
436 } else {
437 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
438 }
439 }
440 return true;
441 }
442
443 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
444 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
445 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
446 return false;
447 BaseOps.push_back(RSrc);
448 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
449 if (BaseOp && !BaseOp->isFI())
450 BaseOps.push_back(BaseOp);
451 const MachineOperand *OffsetImm =
452 getNamedOperand(LdSt, AMDGPU::OpName::offset);
453 Offset = OffsetImm->getImm();
454 const MachineOperand *SOffset =
455 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
456 if (SOffset) {
457 if (SOffset->isReg())
458 BaseOps.push_back(SOffset);
459 else
460 Offset += SOffset->getImm();
461 }
462 // Get appropriate operand, and compute width accordingly.
463 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
464 if (DataOpIdx == -1)
465 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
466 if (DataOpIdx == -1) // LDS DMA
467 return false;
468 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
469 return true;
470 }
471
472 if (isImage(LdSt)) {
473 auto RsrcOpName =
474 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
475 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
476 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
477 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
478 if (VAddr0Idx >= 0) {
479 // GFX10 possible NSA encoding.
480 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
481 BaseOps.push_back(&LdSt.getOperand(I));
482 } else {
483 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
484 }
485 Offset = 0;
486 // Get appropriate operand, and compute width accordingly.
487 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
488 if (DataOpIdx == -1)
489 return false; // no return sampler
490 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
491 return true;
492 }
493
494 if (isSMRD(LdSt)) {
495 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
496 if (!BaseOp) // e.g. S_MEMTIME
497 return false;
498 BaseOps.push_back(BaseOp);
499 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
500 Offset = OffsetOp ? OffsetOp->getImm() : 0;
501 // Get appropriate operand, and compute width accordingly.
502 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
503 if (DataOpIdx == -1)
504 return false;
505 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
506 return true;
507 }
508
509 if (isFLAT(LdSt)) {
510 // Instructions have either vaddr or saddr or both or none.
511 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
512 if (BaseOp)
513 BaseOps.push_back(BaseOp);
514 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
515 if (BaseOp)
516 BaseOps.push_back(BaseOp);
517 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
518 // Get appropriate operand, and compute width accordingly.
519 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
520 if (DataOpIdx == -1)
521 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
522 if (DataOpIdx == -1) // LDS DMA
523 return false;
524 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
525 return true;
526 }
527
528 return false;
529}
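// Illustrative usage sketch (not part of the original file): how a caller such
// as a load/store clustering DAG mutation might query the hook above. The
// helper name and the unused attribute are assumptions made for this example.
LLVM_ATTRIBUTE_UNUSED
static bool exampleQueryMemOperands(const SIInstrInfo &TII,
                                    const MachineInstr &MI,
                                    const TargetRegisterInfo *TRI) {
  SmallVector<const MachineOperand *, 4> BaseOps;
  int64_t Offset = 0;
  bool OffsetIsScalable = false;
  LocationSize Width = LocationSize::precise(0);
  // Returns true and fills the outputs when MI is a recognized memory access:
  // BaseOps holds the address-forming operands and Offset is in bytes.
  return TII.getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
                                           OffsetIsScalable, Width, TRI);
}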
530
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
                                  ArrayRef<const MachineOperand *> BaseOps1,
                                  const MachineInstr &MI2,
                                  ArrayRef<const MachineOperand *> BaseOps2) {
535 // Only examine the first "base" operand of each instruction, on the
536 // assumption that it represents the real base address of the memory access.
537 // Other operands are typically offsets or indices from this base address.
538 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
539 return true;
540
541 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
542 return false;
543
544 auto *MO1 = *MI1.memoperands_begin();
545 auto *MO2 = *MI2.memoperands_begin();
546 if (MO1->getAddrSpace() != MO2->getAddrSpace())
547 return false;
548
549 const auto *Base1 = MO1->getValue();
550 const auto *Base2 = MO2->getValue();
551 if (!Base1 || !Base2)
552 return false;
553 Base1 = getUnderlyingObject(Base1);
554 Base2 = getUnderlyingObject(Base2);
555
556 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
557 return false;
558
559 return Base1 == Base2;
560}
561
bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                      int64_t Offset1, bool OffsetIsScalable1,
                                      ArrayRef<const MachineOperand *> BaseOps2,
                                      int64_t Offset2, bool OffsetIsScalable2,
                                      unsigned ClusterSize,
                                      unsigned NumBytes) const {
568 // If the mem ops (to be clustered) do not have the same base ptr, then they
569 // should not be clustered
570 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
571 if (!BaseOps1.empty() && !BaseOps2.empty()) {
572 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
573 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
574 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
575 return false;
576
577 const SIMachineFunctionInfo *MFI =
578 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
579 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
580 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
581 // If only one base op is empty, they do not have the same base ptr
582 return false;
583 }
584
585 // In order to avoid register pressure, on an average, the number of DWORDS
586 // loaded together by all clustered mem ops should not exceed
587 // MaxMemoryClusterDWords. This is an empirical value based on certain
588 // observations and performance related experiments.
589 // The good thing about this heuristic is - it avoids clustering of too many
590 // sub-word loads, and also avoids clustering of wide loads. Below is the
591 // brief summary of how the heuristic behaves for various `LoadSize` when
592 // MaxMemoryClusterDWords is 8.
593 //
594 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
595 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
596 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
597 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
598 // (5) LoadSize >= 17: do not cluster
599 const unsigned LoadSize = NumBytes / ClusterSize;
600 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
601 return NumDWords <= MaxMemoryClusterDWords;
602}
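// Worked example (not in the original source): with the default limit of 8
// DWORDs (the value assumed by the table above) and ClusterSize = 4, four
// dword loads give LoadSize = 4 and NumDWords = ((4 + 3) / 4) * 4 = 4 <= 8,
// so they are clustered; four dwordx4 loads give LoadSize = 16 and
// NumDWords = ((16 + 3) / 4) * 4 = 16 > 8, so they are not.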
603
604// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
605// the first 16 loads will be interleaved with the stores, and the next 16 will
606// be clustered as expected. It should really split into 2 16 store batches.
607//
608// Loads are clustered until this returns false, rather than trying to schedule
609// groups of stores. This also means we have to deal with saying different
610// address space loads should be clustered, and ones which might cause bank
611// conflicts.
612//
613// This might be deprecated so it might not be worth that much effort to fix.
bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
                                          int64_t Offset0, int64_t Offset1,
                                          unsigned NumLoads) const {
617 assert(Offset1 > Offset0 &&
618 "Second offset should be larger than first offset!");
619 // If we have less than 16 loads in a row, and the offsets are within 64
620 // bytes, then schedule together.
621
622 // A cacheline is 64 bytes (for global memory).
623 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
624}
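// Worked example (not in the original source): two loads at offsets 0 and 48
// fall within one 64-byte cacheline and are scheduled together (for up to 16
// loads in a row); loads at offsets 0 and 128 are not.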
625
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc,
                              const char *Msg = "illegal VGPR to SGPR copy") {
  MachineFunction *MF = MBB.getParent();

  LLVMContext &C = MF->getFunction().getContext();
  C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
635
636 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
637 .addReg(SrcReg, getKillRegState(KillSrc));
638}
639
640/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
641/// possible to have a direct copy in these cases on GFX908, so an intermediate
642/// VGPR copy is required.
static void indirectCopyToAGPR(const SIInstrInfo &TII,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, MCRegister DestReg,
                               MCRegister SrcReg, bool KillSrc,
                               RegScavenger &RS, bool RegsOverlap,
                               Register ImpDefSuperReg = Register(),
                               Register ImpUseSuperReg = Register()) {
651 assert((TII.getSubtarget().hasMAIInsts() &&
652 !TII.getSubtarget().hasGFX90AInsts()) &&
653 "Expected GFX908 subtarget.");
654
655 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
656 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
657 "Source register of the copy should be either an SGPR or an AGPR.");
658
659 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
660 "Destination register of the copy should be an AGPR.");
661
662 const SIRegisterInfo &RI = TII.getRegisterInfo();
663
664 // First try to find defining accvgpr_write to avoid temporary registers.
665 // In the case of copies of overlapping AGPRs, we conservatively do not
666 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
667 // an accvgpr_write used for this same copy due to implicit-defs
668 if (!RegsOverlap) {
669 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
670 --Def;
671
672 if (!Def->modifiesRegister(SrcReg, &RI))
673 continue;
674
675 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
676 Def->getOperand(0).getReg() != SrcReg)
677 break;
678
679 MachineOperand &DefOp = Def->getOperand(1);
680 assert(DefOp.isReg() || DefOp.isImm());
681
682 if (DefOp.isReg()) {
683 bool SafeToPropagate = true;
684 // Check that register source operand is not clobbered before MI.
685 // Immediate operands are always safe to propagate.
686 for (auto I = Def; I != MI && SafeToPropagate; ++I)
687 if (I->modifiesRegister(DefOp.getReg(), &RI))
688 SafeToPropagate = false;
689
690 if (!SafeToPropagate)
691 break;
692
693 for (auto I = Def; I != MI; ++I)
694 I->clearRegisterKills(DefOp.getReg(), &RI);
695 }
696
697 MachineInstrBuilder Builder =
698 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
699 .add(DefOp);
700 if (ImpDefSuperReg)
701 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
702
703 if (ImpUseSuperReg) {
        Builder.addReg(ImpUseSuperReg,
                       getKillRegState(KillSrc) | RegState::Implicit);
      }
707
708 return;
709 }
710 }
711
712 RS.enterBasicBlockEnd(MBB);
713 RS.backward(std::next(MI));
714
715 // Ideally we want to have three registers for a long reg_sequence copy
716 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
717 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
718 *MBB.getParent());
719
720 // Registers in the sequence are allocated contiguously so we can just
721 // use register number to pick one of three round-robin temps.
722 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
723 Register Tmp =
724 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
725 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
726 "VGPR used for an intermediate copy should have been reserved.");
727
728 // Only loop through if there are any free registers left. We don't want to
729 // spill.
730 while (RegNo--) {
731 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
732 /* RestoreAfter */ false, 0,
733 /* AllowSpill */ false);
734 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
735 break;
736 Tmp = Tmp2;
737 RS.setRegUsed(Tmp);
738 }
739
740 // Insert copy to temporary VGPR.
741 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
742 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
743 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
744 } else {
745 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
746 }
747
748 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
749 .addReg(SrcReg, getKillRegState(KillSrc));
750 if (ImpUseSuperReg) {
    UseBuilder.addReg(ImpUseSuperReg,
                      getKillRegState(KillSrc) | RegState::Implicit);
  }
754
755 MachineInstrBuilder DefBuilder
756 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
757 .addReg(Tmp, RegState::Kill);
758
759 if (ImpDefSuperReg)
760 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
761}
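// Illustrative note (not in the original source): for a long reg_sequence copy
// the intent is a three-register round-robin keyed on the destination AGPR
// number, so that back-to-back element copies do not reuse the same temporary
// VGPR (subject to what the scavenger can actually provide). That is what
// hides the two wait states between v_mov_b32 and v_accvgpr_write mentioned
// above.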
762
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
                           const TargetRegisterClass *RC, bool Forward) {
767 const SIRegisterInfo &RI = TII.getRegisterInfo();
768 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
770 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
771
772 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
773 int16_t SubIdx = BaseIndices[Idx];
774 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
775 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
776 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
777 unsigned Opcode = AMDGPU::S_MOV_B32;
778
779 // Is SGPR aligned? If so try to combine with next.
780 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
781 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
782 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
783 // Can use SGPR64 copy
784 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
785 SubIdx = RI.getSubRegFromChannel(Channel, 2);
786 DestSubReg = RI.getSubReg(DestReg, SubIdx);
787 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
788 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
789 Opcode = AMDGPU::S_MOV_B64;
790 Idx++;
791 }
792
793 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
794 .addReg(SrcSubReg)
795 .addReg(SrcReg, RegState::Implicit);
796
797 if (!FirstMI)
798 FirstMI = LastMI;
799
800 if (!Forward)
801 I--;
802 }
803
804 assert(FirstMI && LastMI);
805 if (!Forward)
806 std::swap(FirstMI, LastMI);
807
808 FirstMI->addOperand(
809 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
810
811 if (KillSrc)
812 LastMI->addRegisterKilled(SrcReg, &RI);
813}
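// Worked example (not in the original source): a copy from
// $sgpr8_sgpr9_sgpr10_sgpr11 to $sgpr4_sgpr5_sgpr6_sgpr7 has even-aligned
// source and destination subregisters, so it is emitted as two S_MOV_B64
// instructions (plus the implicit super-register operands added above) rather
// than four S_MOV_B32s.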
814
void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, Register DestReg,
                              Register SrcReg, bool KillSrc, bool RenamableDest,
                              bool RenamableSrc) const {
820 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
821 unsigned Size = RI.getRegSizeInBits(*RC);
822 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
823 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
824
825 // The rest of copyPhysReg assumes Src and Dst size are the same size.
826 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
827 // we remove Fix16BitCopies and this code block?
828 if (Fix16BitCopies) {
829 if (((Size == 16) != (SrcSize == 16))) {
830 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
831 assert(ST.useRealTrue16Insts());
832 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
833 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
834 RegToFix = SubReg;
835
836 if (DestReg == SrcReg) {
837 // Identity copy. Insert empty bundle since ExpandPostRA expects an
838 // instruction here.
839 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
840 return;
841 }
842 RC = RI.getPhysRegBaseClass(DestReg);
843 Size = RI.getRegSizeInBits(*RC);
844 SrcRC = RI.getPhysRegBaseClass(SrcReg);
845 SrcSize = RI.getRegSizeInBits(*SrcRC);
846 }
847 }
848
849 if (RC == &AMDGPU::VGPR_32RegClass) {
850 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
851 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
852 AMDGPU::AGPR_32RegClass.contains(SrcReg));
853 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
854 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
855 BuildMI(MBB, MI, DL, get(Opc), DestReg)
856 .addReg(SrcReg, getKillRegState(KillSrc));
857 return;
858 }
859
860 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
861 RC == &AMDGPU::SReg_32RegClass) {
862 if (SrcReg == AMDGPU::SCC) {
863 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
864 .addImm(1)
865 .addImm(0);
866 return;
867 }
868
869 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
870 if (DestReg == AMDGPU::VCC_LO) {
871 // FIXME: Hack until VReg_1 removed.
872 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
873 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
874 .addImm(0)
875 .addReg(SrcReg, getKillRegState(KillSrc));
876 return;
877 }
878
879 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
880 return;
881 }
882
883 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 return;
886 }
887
888 if (RC == &AMDGPU::SReg_64RegClass) {
889 if (SrcReg == AMDGPU::SCC) {
890 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
891 .addImm(1)
892 .addImm(0);
893 return;
894 }
895
896 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
897 if (DestReg == AMDGPU::VCC) {
898 // FIXME: Hack until VReg_1 removed.
899 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
900 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
901 .addImm(0)
902 .addReg(SrcReg, getKillRegState(KillSrc));
903 return;
904 }
905
906 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
907 return;
908 }
909
910 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
911 .addReg(SrcReg, getKillRegState(KillSrc));
912 return;
913 }
914
915 if (DestReg == AMDGPU::SCC) {
916 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
917 // but SelectionDAG emits such copies for i1 sources.
918 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
919 // This copy can only be produced by patterns
920 // with explicit SCC, which are known to be enabled
921 // only for subtargets with S_CMP_LG_U64 present.
922 assert(ST.hasScalarCompareEq64());
923 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
924 .addReg(SrcReg, getKillRegState(KillSrc))
925 .addImm(0);
926 } else {
927 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
928 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
929 .addReg(SrcReg, getKillRegState(KillSrc))
930 .addImm(0);
931 }
932
933 return;
934 }
935
936 if (RC == &AMDGPU::AGPR_32RegClass) {
937 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
938 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
939 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
940 .addReg(SrcReg, getKillRegState(KillSrc));
941 return;
942 }
943
944 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
945 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
946 .addReg(SrcReg, getKillRegState(KillSrc));
947 return;
948 }
949
950 // FIXME: Pass should maintain scavenger to avoid scan through the block on
951 // every AGPR spill.
952 RegScavenger RS;
953 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
954 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
955 return;
956 }
957
958 if (Size == 16) {
959 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
960 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
961 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
962
963 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
964 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
965 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
966 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
967 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
968 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
969 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
970 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
971
972 if (IsSGPRDst) {
973 if (!IsSGPRSrc) {
974 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
975 return;
976 }
977
978 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
979 .addReg(NewSrcReg, getKillRegState(KillSrc));
980 return;
981 }
982
983 if (IsAGPRDst || IsAGPRSrc) {
984 if (!DstLow || !SrcLow) {
985 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
986 "Cannot use hi16 subreg with an AGPR!");
987 }
988
989 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
990 return;
991 }
992
993 if (ST.useRealTrue16Insts()) {
994 if (IsSGPRSrc) {
995 assert(SrcLow);
996 SrcReg = NewSrcReg;
997 }
998 // Use the smaller instruction encoding if possible.
999 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1000 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1001 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1002 .addReg(SrcReg);
1003 } else {
1004 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1005 .addImm(0) // src0_modifiers
1006 .addReg(SrcReg)
1007 .addImm(0); // op_sel
1008 }
1009 return;
1010 }
1011
1012 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1013 if (!DstLow || !SrcLow) {
1014 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1015 "Cannot use hi16 subreg on VI!");
1016 }
1017
1018 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1019 .addReg(NewSrcReg, getKillRegState(KillSrc));
1020 return;
1021 }
1022
    auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
                   .addImm(0) // src0_modifiers
                   .addReg(NewSrcReg)
                   .addImm(0) // clamp
                   .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                                  : AMDGPU::SDWA::SdwaSel::WORD_1)
                   .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
                   .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                                  : AMDGPU::SDWA::SdwaSel::WORD_1)
                   .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
    // First implicit operand is $exec.
    MIB->tieOperands(0, MIB->getNumOperands() - 1);
1035 return;
1036 }
1037
1038 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1039 if (ST.hasMovB64()) {
1040 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1041 .addReg(SrcReg, getKillRegState(KillSrc));
1042 return;
1043 }
1044 if (ST.hasPkMovB32()) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
          .addImm(SISrcMods::OP_SEL_1) // src0_modifiers
          .addReg(SrcReg)
          .addImm(SISrcMods::OP_SEL_1) // src1_modifiers
          .addReg(SrcReg)
1050 .addImm(0) // op_sel_lo
1051 .addImm(0) // op_sel_hi
1052 .addImm(0) // neg_lo
1053 .addImm(0) // neg_hi
1054 .addImm(0) // clamp
1055 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1056 return;
1057 }
1058 }
1059
1060 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1061 if (RI.isSGPRClass(RC)) {
1062 if (!RI.isSGPRClass(SrcRC)) {
1063 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1064 return;
1065 }
1066 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1067 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1068 Forward);
1069 return;
1070 }
1071
1072 unsigned EltSize = 4;
1073 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1074 if (RI.isAGPRClass(RC)) {
1075 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1076 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1077 else if (RI.hasVGPRs(SrcRC) ||
1078 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1079 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1080 else
1081 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1082 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1083 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1084 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1085 (RI.isProperlyAlignedRC(*RC) &&
1086 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1087 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1088 if (ST.hasMovB64()) {
1089 Opcode = AMDGPU::V_MOV_B64_e32;
1090 EltSize = 8;
1091 } else if (ST.hasPkMovB32()) {
1092 Opcode = AMDGPU::V_PK_MOV_B32;
1093 EltSize = 8;
1094 }
1095 }
1096
1097 // For the cases where we need an intermediate instruction/temporary register
1098 // (destination is an AGPR), we need a scavenger.
1099 //
1100 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1101 // whole block for every handled copy.
1102 std::unique_ptr<RegScavenger> RS;
1103 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1104 RS = std::make_unique<RegScavenger>();
1105
1106 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1107
1108 // If there is an overlap, we can't kill the super-register on the last
1109 // instruction, since it will also kill the components made live by this def.
1110 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1111 const bool CanKillSuperReg = KillSrc && !Overlap;
1112
1113 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1114 unsigned SubIdx;
1115 if (Forward)
1116 SubIdx = SubIndices[Idx];
1117 else
1118 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1119 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1120 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1121 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1122
1123 bool IsFirstSubreg = Idx == 0;
1124 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1125
1126 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1127 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1128 Register ImpUseSuper = SrcReg;
1129 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1130 *RS, Overlap, ImpDefSuper, ImpUseSuper);
    } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
      MachineInstrBuilder MIB =
          BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
              .addImm(SISrcMods::OP_SEL_1)
              .addReg(SrcSubReg)
              .addImm(SISrcMods::OP_SEL_1)
              .addReg(SrcSubReg)
              .addImm(0) // op_sel_lo
              .addImm(0) // op_sel_hi
              .addImm(0) // neg_lo
              .addImm(0) // neg_hi
              .addImm(0) // clamp
              .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
      if (IsFirstSubreg)
        MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
    } else {
1147 MachineInstrBuilder Builder =
1148 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1149 if (IsFirstSubreg)
1150 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1151
1152 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1153 }
1154 }
1155}
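// Illustrative example (not in the original source): on a subtarget without
// v_mov_b64 or v_pk_mov_b32, copying $vgpr2_vgpr3 into $vgpr0_vgpr1 falls
// through to the subregister loop and expands roughly to
//   $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3
//   $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3
// with the super-register kill only on the final element copy.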
1156
1157int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1158 int NewOpc;
1159
1160 // Try to map original to commuted opcode
1161 NewOpc = AMDGPU::getCommuteRev(Opcode);
1162 if (NewOpc != -1)
1163 // Check if the commuted (REV) opcode exists on the target.
1164 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1165
1166 // Try to map commuted to original opcode
1167 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1168 if (NewOpc != -1)
1169 // Check if the original (non-REV) opcode exists on the target.
1170 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1171
1172 return Opcode;
1173}
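// Illustrative example (not in the original source): commuting a subtract maps
// it to its REV form, e.g. commuteOpcode(V_SUB_F32_e32) yields
// V_SUBREV_F32_e32 (provided the REV opcode exists for the subtarget), and
// commuting the REV form maps back to the original opcode.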
1174
1175const TargetRegisterClass *
1177 return &AMDGPU::VGPR_32RegClass;
1178}
1179
void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I,
                                     const DebugLoc &DL, Register DstReg,
                                     ArrayRef<MachineOperand> Cond,
                                     Register TrueReg,
                                     Register FalseReg) const {
1186 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1187 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1189 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1190 "Not a VGPR32 reg");
1191
1192 if (Cond.size() == 1) {
1193 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1194 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1195 .add(Cond[0]);
1196 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1197 .addImm(0)
1198 .addReg(FalseReg)
1199 .addImm(0)
1200 .addReg(TrueReg)
1201 .addReg(SReg);
1202 } else if (Cond.size() == 2) {
1203 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1204 switch (Cond[0].getImm()) {
1205 case SIInstrInfo::SCC_TRUE: {
1206 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1207 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1208 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1209 .addImm(0)
1210 .addReg(FalseReg)
1211 .addImm(0)
1212 .addReg(TrueReg)
1213 .addReg(SReg);
1214 break;
1215 }
1216 case SIInstrInfo::SCC_FALSE: {
1217 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1218 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1219 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1220 .addImm(0)
1221 .addReg(FalseReg)
1222 .addImm(0)
1223 .addReg(TrueReg)
1224 .addReg(SReg);
1225 break;
1226 }
1227 case SIInstrInfo::VCCNZ: {
1228 MachineOperand RegOp = Cond[1];
1229 RegOp.setImplicit(false);
1230 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1231 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1232 .add(RegOp);
1233 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1234 .addImm(0)
1235 .addReg(FalseReg)
1236 .addImm(0)
1237 .addReg(TrueReg)
1238 .addReg(SReg);
1239 break;
1240 }
1241 case SIInstrInfo::VCCZ: {
1242 MachineOperand RegOp = Cond[1];
1243 RegOp.setImplicit(false);
1244 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1245 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1246 .add(RegOp);
1247 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1248 .addImm(0)
1249 .addReg(TrueReg)
1250 .addImm(0)
1251 .addReg(FalseReg)
1252 .addReg(SReg);
1253 break;
1254 }
1255 case SIInstrInfo::EXECNZ: {
1256 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1257 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1258 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1259 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1260 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1261 .addImm(0)
1262 .addReg(FalseReg)
1263 .addImm(0)
1264 .addReg(TrueReg)
1265 .addReg(SReg);
1266 break;
1267 }
1268 case SIInstrInfo::EXECZ: {
1269 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1270 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1271 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1272 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1273 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1274 .addImm(0)
1275 .addReg(FalseReg)
1276 .addImm(0)
1277 .addReg(TrueReg)
1278 .addReg(SReg);
1279 llvm_unreachable("Unhandled branch predicate EXECZ");
1280 break;
1281 }
1282 default:
1283 llvm_unreachable("invalid branch predicate");
1284 }
1285 } else {
1286 llvm_unreachable("Can only handle Cond size 1 or 2");
1287 }
1288}
1289
Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
1294 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1295 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1296 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1297 .addImm(Value)
1298 .addReg(SrcReg);
1299
1300 return Reg;
1301}
1302
Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
1307 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1308 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1309 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1310 .addImm(Value)
1311 .addReg(SrcReg);
1312
1313 return Reg;
1314}
1315
bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
                                          const Register Reg,
                                          int64_t &ImmVal) const {
1319 switch (MI.getOpcode()) {
1320 case AMDGPU::V_MOV_B32_e32:
1321 case AMDGPU::S_MOV_B32:
1322 case AMDGPU::S_MOVK_I32:
1323 case AMDGPU::S_MOV_B64:
1324 case AMDGPU::V_MOV_B64_e32:
1325 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1326 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1327 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1328 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1329 case AMDGPU::V_MOV_B64_PSEUDO:
1330 case AMDGPU::V_MOV_B16_t16_e32: {
1331 const MachineOperand &Src0 = MI.getOperand(1);
1332 if (Src0.isImm()) {
1333 ImmVal = Src0.getImm();
1334 return MI.getOperand(0).getReg() == Reg;
1335 }
1336
1337 return false;
1338 }
1339 case AMDGPU::V_MOV_B16_t16_e64: {
1340 const MachineOperand &Src0 = MI.getOperand(2);
1341 if (Src0.isImm() && !MI.getOperand(1).getImm()) {
1342 ImmVal = Src0.getImm();
1343 return MI.getOperand(0).getReg() == Reg;
1344 }
1345
1346 return false;
1347 }
1348 case AMDGPU::S_BREV_B32:
1349 case AMDGPU::V_BFREV_B32_e32:
1350 case AMDGPU::V_BFREV_B32_e64: {
1351 const MachineOperand &Src0 = MI.getOperand(1);
1352 if (Src0.isImm()) {
1353 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1354 return MI.getOperand(0).getReg() == Reg;
1355 }
1356
1357 return false;
1358 }
1359 case AMDGPU::S_NOT_B32:
1360 case AMDGPU::V_NOT_B32_e32:
1361 case AMDGPU::V_NOT_B32_e64: {
1362 const MachineOperand &Src0 = MI.getOperand(1);
1363 if (Src0.isImm()) {
1364 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1365 return MI.getOperand(0).getReg() == Reg;
1366 }
1367
1368 return false;
1369 }
1370 default:
1371 return false;
1372 }
1373}
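// Worked example (not in the original source): for $sgpr0 = S_MOV_B32 42 the
// function reports ImmVal = 42; for S_NOT_B32 0 it reports -1 (the inverted
// 32-bit immediate, sign-extended to 64 bits), and S_BREV_B32 similarly
// reports the bit-reversed immediate. Only the materialized value is exposed,
// never the defining opcode.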
1374
1375std::optional<int64_t>
1377 if (Op.isImm())
1378 return Op.getImm();
1379
1380 if (!Op.isReg() || !Op.getReg().isVirtual())
1381 return std::nullopt;
1382 MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
1383 const MachineInstr *Def = MRI.getVRegDef(Op.getReg());
1384 if (Def && Def->isMoveImmediate()) {
1385 const MachineOperand &ImmSrc = Def->getOperand(1);
1386 if (ImmSrc.isImm())
1387 return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
1388 }
1389
1390 return std::nullopt;
1391}
1392
1394
1395 if (RI.isAGPRClass(DstRC))
1396 return AMDGPU::COPY;
1397 if (RI.getRegSizeInBits(*DstRC) == 16) {
1398 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1399 // before RA.
1400 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1401 }
1402 if (RI.getRegSizeInBits(*DstRC) == 32)
1403 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1404 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1405 return AMDGPU::S_MOV_B64;
1406 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1407 return AMDGPU::V_MOV_B64_PSEUDO;
1408 return AMDGPU::COPY;
1409}
1410
const MCInstrDesc &
SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
                                     bool IsIndirectSrc) const {
1414 if (IsIndirectSrc) {
1415 if (VecSize <= 32) // 4 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1417 if (VecSize <= 64) // 8 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1419 if (VecSize <= 96) // 12 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1421 if (VecSize <= 128) // 16 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1423 if (VecSize <= 160) // 20 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1425 if (VecSize <= 192) // 24 bytes
1426 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
1427 if (VecSize <= 224) // 28 bytes
1428 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
1429 if (VecSize <= 256) // 32 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1431 if (VecSize <= 288) // 36 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1433 if (VecSize <= 320) // 40 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1435 if (VecSize <= 352) // 44 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1437 if (VecSize <= 384) // 48 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1439 if (VecSize <= 512) // 64 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1441 if (VecSize <= 1024) // 128 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1443
1444 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1445 }
1446
1447 if (VecSize <= 32) // 4 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1449 if (VecSize <= 64) // 8 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1451 if (VecSize <= 96) // 12 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1453 if (VecSize <= 128) // 16 bytes
1454 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1455 if (VecSize <= 160) // 20 bytes
1456 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1457 if (VecSize <= 192) // 24 bytes
1458 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
1459 if (VecSize <= 224) // 28 bytes
1460 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
1461 if (VecSize <= 256) // 32 bytes
1462 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1463 if (VecSize <= 288) // 36 bytes
1464 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1465 if (VecSize <= 320) // 40 bytes
1466 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1467 if (VecSize <= 352) // 44 bytes
1468 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1469 if (VecSize <= 384) // 48 bytes
1470 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1471 if (VecSize <= 512) // 64 bytes
1472 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1473 if (VecSize <= 1024) // 128 bytes
1474 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1475
1476 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1477}
1478
1479static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1480 if (VecSize <= 32) // 4 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1482 if (VecSize <= 64) // 8 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1484 if (VecSize <= 96) // 12 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1486 if (VecSize <= 128) // 16 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1488 if (VecSize <= 160) // 20 bytes
1489 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1490 if (VecSize <= 192) // 24 bytes
1491 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1492 if (VecSize <= 224) // 28 bytes
1493 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1494 if (VecSize <= 256) // 32 bytes
1495 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1496 if (VecSize <= 288) // 36 bytes
1497 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1498 if (VecSize <= 320) // 40 bytes
1499 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1500 if (VecSize <= 352) // 44 bytes
1501 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1502 if (VecSize <= 384) // 48 bytes
1503 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1504 if (VecSize <= 512) // 64 bytes
1505 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1506 if (VecSize <= 1024) // 128 bytes
1507 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1508
1509 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1510}
1511
1512static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1513 if (VecSize <= 32) // 4 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1515 if (VecSize <= 64) // 8 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1517 if (VecSize <= 96) // 12 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1519 if (VecSize <= 128) // 16 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1521 if (VecSize <= 160) // 20 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1523 if (VecSize <= 192) // 24 bytes
1524 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1525 if (VecSize <= 224) // 28 bytes
1526 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1527 if (VecSize <= 256) // 32 bytes
1528 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1529 if (VecSize <= 288) // 36 bytes
1530 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1531 if (VecSize <= 320) // 40 bytes
1532 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1533 if (VecSize <= 352) // 44 bytes
1534 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1535 if (VecSize <= 384) // 48 bytes
1536 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1537 if (VecSize <= 512) // 64 bytes
1538 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1539 if (VecSize <= 1024) // 128 bytes
1540 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1541
1542 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1543}
1544
1545static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1546 if (VecSize <= 64) // 8 bytes
1547 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1548 if (VecSize <= 128) // 16 bytes
1549 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1550 if (VecSize <= 256) // 32 bytes
1551 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1552 if (VecSize <= 512) // 64 bytes
1553 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1554 if (VecSize <= 1024) // 128 bytes
1555 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1556
1557 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1558}
1559
1560const MCInstrDesc &
1561SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1562 bool IsSGPR) const {
1563 if (IsSGPR) {
1564 switch (EltSize) {
1565 case 32:
1566 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1567 case 64:
1568 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1569 default:
1570 llvm_unreachable("invalid reg indexing elt size");
1571 }
1572 }
1573
  assert(EltSize == 32 && "invalid reg indexing elt size");
  return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
}
1577
1578static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1579 switch (Size) {
1580 case 4:
1581 return AMDGPU::SI_SPILL_S32_SAVE;
1582 case 8:
1583 return AMDGPU::SI_SPILL_S64_SAVE;
1584 case 12:
1585 return AMDGPU::SI_SPILL_S96_SAVE;
1586 case 16:
1587 return AMDGPU::SI_SPILL_S128_SAVE;
1588 case 20:
1589 return AMDGPU::SI_SPILL_S160_SAVE;
1590 case 24:
1591 return AMDGPU::SI_SPILL_S192_SAVE;
1592 case 28:
1593 return AMDGPU::SI_SPILL_S224_SAVE;
1594 case 32:
1595 return AMDGPU::SI_SPILL_S256_SAVE;
1596 case 36:
1597 return AMDGPU::SI_SPILL_S288_SAVE;
1598 case 40:
1599 return AMDGPU::SI_SPILL_S320_SAVE;
1600 case 44:
1601 return AMDGPU::SI_SPILL_S352_SAVE;
1602 case 48:
1603 return AMDGPU::SI_SPILL_S384_SAVE;
1604 case 64:
1605 return AMDGPU::SI_SPILL_S512_SAVE;
1606 case 128:
1607 return AMDGPU::SI_SPILL_S1024_SAVE;
1608 default:
1609 llvm_unreachable("unknown register size");
1610 }
1611}
1612
1613static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1614 switch (Size) {
1615 case 2:
1616 return AMDGPU::SI_SPILL_V16_SAVE;
1617 case 4:
1618 return AMDGPU::SI_SPILL_V32_SAVE;
1619 case 8:
1620 return AMDGPU::SI_SPILL_V64_SAVE;
1621 case 12:
1622 return AMDGPU::SI_SPILL_V96_SAVE;
1623 case 16:
1624 return AMDGPU::SI_SPILL_V128_SAVE;
1625 case 20:
1626 return AMDGPU::SI_SPILL_V160_SAVE;
1627 case 24:
1628 return AMDGPU::SI_SPILL_V192_SAVE;
1629 case 28:
1630 return AMDGPU::SI_SPILL_V224_SAVE;
1631 case 32:
1632 return AMDGPU::SI_SPILL_V256_SAVE;
1633 case 36:
1634 return AMDGPU::SI_SPILL_V288_SAVE;
1635 case 40:
1636 return AMDGPU::SI_SPILL_V320_SAVE;
1637 case 44:
1638 return AMDGPU::SI_SPILL_V352_SAVE;
1639 case 48:
1640 return AMDGPU::SI_SPILL_V384_SAVE;
1641 case 64:
1642 return AMDGPU::SI_SPILL_V512_SAVE;
1643 case 128:
1644 return AMDGPU::SI_SPILL_V1024_SAVE;
1645 default:
1646 llvm_unreachable("unknown register size");
1647 }
1648}
1649
1650static unsigned getAVSpillSaveOpcode(unsigned Size) {
1651 switch (Size) {
1652 case 4:
1653 return AMDGPU::SI_SPILL_AV32_SAVE;
1654 case 8:
1655 return AMDGPU::SI_SPILL_AV64_SAVE;
1656 case 12:
1657 return AMDGPU::SI_SPILL_AV96_SAVE;
1658 case 16:
1659 return AMDGPU::SI_SPILL_AV128_SAVE;
1660 case 20:
1661 return AMDGPU::SI_SPILL_AV160_SAVE;
1662 case 24:
1663 return AMDGPU::SI_SPILL_AV192_SAVE;
1664 case 28:
1665 return AMDGPU::SI_SPILL_AV224_SAVE;
1666 case 32:
1667 return AMDGPU::SI_SPILL_AV256_SAVE;
1668 case 36:
1669 return AMDGPU::SI_SPILL_AV288_SAVE;
1670 case 40:
1671 return AMDGPU::SI_SPILL_AV320_SAVE;
1672 case 44:
1673 return AMDGPU::SI_SPILL_AV352_SAVE;
1674 case 48:
1675 return AMDGPU::SI_SPILL_AV384_SAVE;
1676 case 64:
1677 return AMDGPU::SI_SPILL_AV512_SAVE;
1678 case 128:
1679 return AMDGPU::SI_SPILL_AV1024_SAVE;
1680 default:
1681 llvm_unreachable("unknown register size");
1682 }
1683}
1684
1685static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1686 bool IsVectorSuperClass) {
1687 // Currently, there is only 32-bit WWM register spills needed.
1688 if (Size != 4)
1689 llvm_unreachable("unknown wwm register spill size");
1690
1691 if (IsVectorSuperClass)
1692 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1693
1694 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1695}
1696
unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
    Register Reg, const TargetRegisterClass *RC, unsigned Size,
    const SIMachineFunctionInfo &MFI) const {
1700 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1701
  // Choose the right opcode if spilling a WWM register.
  if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
    return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1705
1706 // TODO: Check if AGPRs are available
1707 if (ST.hasMAIInsts())
1708 return getAVSpillSaveOpcode(Size);

  return getVGPRSpillSaveOpcode(Size);
}
1712
void SIInstrInfo::storeRegToStackSlot(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
    bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
    MachineInstr::MIFlag Flags) const {
1717 MachineFunction *MF = MBB.getParent();
1718 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1719 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1720 const DebugLoc &DL = MBB.findDebugLoc(MI);
1721
1722 MachinePointerInfo PtrInfo
1723 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1724 MachineMemOperand *MMO = MF->getMachineMemOperand(
1725 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1726 FrameInfo.getObjectAlign(FrameIndex));
1727 unsigned SpillSize = RI.getSpillSize(*RC);
1728
1729 MachineRegisterInfo &MRI = MF->getRegInfo();
1730 if (RI.isSGPRClass(RC)) {
1731 MFI->setHasSpilledSGPRs();
1732 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1733 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1734 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1735
1736 // We are only allowed to create one new instruction when spilling
1737 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1738 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1739
1740 // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1741 // need to make sure we are using the correct register class.
1742 if (SrcReg.isVirtual() && SpillSize == 4) {
1743 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1744 }
1745
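// Illustrative sketch (register and stack slot names are hypothetical): a
// 4-byte SGPR spill built below looks roughly like
//   SI_SPILL_S32_SAVE killed $sgpr5, %stack.0
// and, when spilling to VGPR lanes is enabled (see spillSGPRToVGPR() below),
// it is later rewritten by the SGPR spill lowering into a
// SI_SPILL_S32_TO_VGPR, i.e. a v_writelane_b32 into a lane of a reserved VGPR.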
1746 BuildMI(MBB, MI, DL, OpDesc)
1747 .addReg(SrcReg, getKillRegState(isKill)) // data
1748 .addFrameIndex(FrameIndex) // addr
1749 .addMemOperand(MMO)
1751
1752 if (RI.spillSGPRToVGPR())
1753 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1754 return;
1755 }
1756
1757 unsigned Opcode =
1758 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1759 MFI->setHasSpilledVGPRs();
1760
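// Illustrative sketch (operands are hypothetical): a 4-byte VGPR spill built
// below looks roughly like
//   SI_SPILL_V32_SAVE killed $vgpr1, %stack.0, $sgpr32, 0
// (data, frame index, scratch offset register, immediate offset); frame index
// elimination later expands it into an actual scratch/buffer store against
// the stack pointer.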
1761 BuildMI(MBB, MI, DL, get(Opcode))
1762 .addReg(SrcReg, getKillRegState(isKill)) // data
1763 .addFrameIndex(FrameIndex) // addr
1764 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1765 .addImm(0) // offset
1766 .addMemOperand(MMO);
1767}
1768
1769static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1770 switch (Size) {
1771 case 4:
1772 return AMDGPU::SI_SPILL_S32_RESTORE;
1773 case 8:
1774 return AMDGPU::SI_SPILL_S64_RESTORE;
1775 case 12:
1776 return AMDGPU::SI_SPILL_S96_RESTORE;
1777 case 16:
1778 return AMDGPU::SI_SPILL_S128_RESTORE;
1779 case 20:
1780 return AMDGPU::SI_SPILL_S160_RESTORE;
1781 case 24:
1782 return AMDGPU::SI_SPILL_S192_RESTORE;
1783 case 28:
1784 return AMDGPU::SI_SPILL_S224_RESTORE;
1785 case 32:
1786 return AMDGPU::SI_SPILL_S256_RESTORE;
1787 case 36:
1788 return AMDGPU::SI_SPILL_S288_RESTORE;
1789 case 40:
1790 return AMDGPU::SI_SPILL_S320_RESTORE;
1791 case 44:
1792 return AMDGPU::SI_SPILL_S352_RESTORE;
1793 case 48:
1794 return AMDGPU::SI_SPILL_S384_RESTORE;
1795 case 64:
1796 return AMDGPU::SI_SPILL_S512_RESTORE;
1797 case 128:
1798 return AMDGPU::SI_SPILL_S1024_RESTORE;
1799 default:
1800 llvm_unreachable("unknown register size");
1801 }
1802}
1803
1804static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1805 switch (Size) {
1806 case 2:
1807 return AMDGPU::SI_SPILL_V16_RESTORE;
1808 case 4:
1809 return AMDGPU::SI_SPILL_V32_RESTORE;
1810 case 8:
1811 return AMDGPU::SI_SPILL_V64_RESTORE;
1812 case 12:
1813 return AMDGPU::SI_SPILL_V96_RESTORE;
1814 case 16:
1815 return AMDGPU::SI_SPILL_V128_RESTORE;
1816 case 20:
1817 return AMDGPU::SI_SPILL_V160_RESTORE;
1818 case 24:
1819 return AMDGPU::SI_SPILL_V192_RESTORE;
1820 case 28:
1821 return AMDGPU::SI_SPILL_V224_RESTORE;
1822 case 32:
1823 return AMDGPU::SI_SPILL_V256_RESTORE;
1824 case 36:
1825 return AMDGPU::SI_SPILL_V288_RESTORE;
1826 case 40:
1827 return AMDGPU::SI_SPILL_V320_RESTORE;
1828 case 44:
1829 return AMDGPU::SI_SPILL_V352_RESTORE;
1830 case 48:
1831 return AMDGPU::SI_SPILL_V384_RESTORE;
1832 case 64:
1833 return AMDGPU::SI_SPILL_V512_RESTORE;
1834 case 128:
1835 return AMDGPU::SI_SPILL_V1024_RESTORE;
1836 default:
1837 llvm_unreachable("unknown register size");
1838 }
1839}
1840
1841static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1842 switch (Size) {
1843 case 4:
1844 return AMDGPU::SI_SPILL_AV32_RESTORE;
1845 case 8:
1846 return AMDGPU::SI_SPILL_AV64_RESTORE;
1847 case 12:
1848 return AMDGPU::SI_SPILL_AV96_RESTORE;
1849 case 16:
1850 return AMDGPU::SI_SPILL_AV128_RESTORE;
1851 case 20:
1852 return AMDGPU::SI_SPILL_AV160_RESTORE;
1853 case 24:
1854 return AMDGPU::SI_SPILL_AV192_RESTORE;
1855 case 28:
1856 return AMDGPU::SI_SPILL_AV224_RESTORE;
1857 case 32:
1858 return AMDGPU::SI_SPILL_AV256_RESTORE;
1859 case 36:
1860 return AMDGPU::SI_SPILL_AV288_RESTORE;
1861 case 40:
1862 return AMDGPU::SI_SPILL_AV320_RESTORE;
1863 case 44:
1864 return AMDGPU::SI_SPILL_AV352_RESTORE;
1865 case 48:
1866 return AMDGPU::SI_SPILL_AV384_RESTORE;
1867 case 64:
1868 return AMDGPU::SI_SPILL_AV512_RESTORE;
1869 case 128:
1870 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1871 default:
1872 llvm_unreachable("unknown register size");
1873 }
1874}
1875
1876static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1877 bool IsVectorSuperClass) {
1878 // Currently, only 32-bit WWM register spills are needed.
1879 if (Size != 4)
1880 llvm_unreachable("unknown wwm register spill size");
1881
1882 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1883 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1884
1885 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1886}
1887
1888unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1889 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1890 const SIMachineFunctionInfo &MFI) const {
1891 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1892
1893 // Choose the right opcode if restoring a WWM register.
1894 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1895 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1896
1897 // TODO: Check if AGPRs are available
1898 if (ST.hasMAIInsts())
1899 return getAVSpillRestoreOpcode(Size);
1900
1901 assert(!RI.isAGPRClass(RC));
1903}
1904
1905void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1906 MachineBasicBlock::iterator MI,
1907 Register DestReg, int FrameIndex,
1908 const TargetRegisterClass *RC,
1909 Register VReg, unsigned SubReg,
1910 MachineInstr::MIFlag Flags) const {
1911 MachineFunction *MF = MBB.getParent();
1913 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1914 const DebugLoc &DL = MBB.findDebugLoc(MI);
1915 unsigned SpillSize = RI.getSpillSize(*RC);
1916
1917 MachinePointerInfo PtrInfo
1918 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1919
1921 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1922 FrameInfo.getObjectAlign(FrameIndex));
1923
1924 if (RI.isSGPRClass(RC)) {
1925 MFI->setHasSpilledSGPRs();
1926 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1927 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1928 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1929
1930 // FIXME: Maybe this should not include a memoperand because it will be
1931 // lowered to non-memory instructions.
1932 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1933 if (DestReg.isVirtual() && SpillSize == 4) {
1935 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1936 }
1937
1938 if (RI.spillSGPRToVGPR())
1939 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1940 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1941 .addFrameIndex(FrameIndex) // addr
1942 .addMemOperand(MMO)
1944
1945 return;
1946 }
1947
1948 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1949 SpillSize, *MFI);
1950 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1951 .addFrameIndex(FrameIndex) // vaddr
1952 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1953 .addImm(0) // offset
1954 .addMemOperand(MMO);
1955}
1956
1961
1964 unsigned Quantity) const {
1965 DebugLoc DL = MBB.findDebugLoc(MI);
1966 unsigned MaxSNopCount = 1u << ST.getSNopBits();
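// Worked example (assuming ST.getSNopBits() == 3, so a single s_nop covers at
// most 8 wait states): a request for 10 is emitted as "s_nop 7" followed by
// "s_nop 1", since the immediate encodes the count minus one.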
1967 while (Quantity > 0) {
1968 unsigned Arg = std::min(Quantity, MaxSNopCount);
1969 Quantity -= Arg;
1970 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1971 }
1972}
1973
1975 auto *MF = MBB.getParent();
1976 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1977
1978 assert(Info->isEntryFunction());
1979
1980 if (MBB.succ_empty()) {
1981 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1982 if (HasNoTerminator) {
1983 if (Info->returnsVoid()) {
1984 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1985 } else {
1986 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1987 }
1988 }
1989 }
1990}
1991
1995 const DebugLoc &DL) const {
1996 MachineFunction *MF = MBB.getParent();
1997 constexpr unsigned DoorbellIDMask = 0x3ff;
1998 constexpr unsigned ECQueueWaveAbort = 0x400;
1999
2000 MachineBasicBlock *TrapBB = &MBB;
2001 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2002
2003 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2004 MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2005 TrapBB = MF->CreateMachineBasicBlock();
2006 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2007 MF->push_back(TrapBB);
2008 MBB.addSuccessor(TrapBB);
2009 }
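// Roughly, the code below ends up emitting the following sequence
// (illustrative only; registers and message operands are shown symbolically):
//   s_cbranch_execnz .trap      ; only when the block had to be split
//   ...
// .trap:
//   s_trap 2
//   s_sendmsg_rtn_b32 s0, <get_doorbell>
//   s_mov_b32 ttmp2, m0
//   s_and_b32 s0, s0, 0x3ff
//   s_or_b32  s0, s0, 0x400
//   s_mov_b32 m0, s0
//   s_sendmsg <interrupt>
//   s_mov_b32 m0, ttmp2
//   s_branch .halt
// .halt:
//   s_sethalt 5
//   s_branch .halt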
2010 // Start with a `s_trap 2`; if we're in PRIV=1 and we need the workaround,
2011 // this will be a nop.
2012 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2013 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2014 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2015 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2016 DoorbellReg)
2018 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2019 .addUse(AMDGPU::M0);
2020 Register DoorbellRegMasked =
2021 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2022 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2023 .addUse(DoorbellReg)
2024 .addImm(DoorbellIDMask);
2025 Register SetWaveAbortBit =
2026 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2027 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2028 .addUse(DoorbellRegMasked)
2029 .addImm(ECQueueWaveAbort);
2030 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2031 .addUse(SetWaveAbortBit);
2032 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2034 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2035 .addUse(AMDGPU::TTMP2);
2036 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2037 TrapBB->addSuccessor(HaltLoopBB);
2038
2039 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2040 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2041 .addMBB(HaltLoopBB);
2042 MF->push_back(HaltLoopBB);
2043 HaltLoopBB->addSuccessor(HaltLoopBB);
2044
2045 return MBB.getNextNode();
2046}
2047
2049 switch (MI.getOpcode()) {
2050 default:
2051 if (MI.isMetaInstruction())
2052 return 0;
2053 return 1; // FIXME: Do wait states equal cycles?
2054
2055 case AMDGPU::S_NOP:
2056 return MI.getOperand(0).getImm() + 1;
2057 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2058 // hazard, even if one exists, won't really be visible. Should we handle it?
2059 }
2060}
2061
2063 MachineBasicBlock &MBB = *MI.getParent();
2064 DebugLoc DL = MBB.findDebugLoc(MI);
2066 switch (MI.getOpcode()) {
2067 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2068 case AMDGPU::S_MOV_B64_term:
2069 // This is only a terminator to get the correct spill code placement during
2070 // register allocation.
2071 MI.setDesc(get(AMDGPU::S_MOV_B64));
2072 break;
2073
2074 case AMDGPU::S_MOV_B32_term:
2075 // This is only a terminator to get the correct spill code placement during
2076 // register allocation.
2077 MI.setDesc(get(AMDGPU::S_MOV_B32));
2078 break;
2079
2080 case AMDGPU::S_XOR_B64_term:
2081 // This is only a terminator to get the correct spill code placement during
2082 // register allocation.
2083 MI.setDesc(get(AMDGPU::S_XOR_B64));
2084 break;
2085
2086 case AMDGPU::S_XOR_B32_term:
2087 // This is only a terminator to get the correct spill code placement during
2088 // register allocation.
2089 MI.setDesc(get(AMDGPU::S_XOR_B32));
2090 break;
2091 case AMDGPU::S_OR_B64_term:
2092 // This is only a terminator to get the correct spill code placement during
2093 // register allocation.
2094 MI.setDesc(get(AMDGPU::S_OR_B64));
2095 break;
2096 case AMDGPU::S_OR_B32_term:
2097 // This is only a terminator to get the correct spill code placement during
2098 // register allocation.
2099 MI.setDesc(get(AMDGPU::S_OR_B32));
2100 break;
2101
2102 case AMDGPU::S_ANDN2_B64_term:
2103 // This is only a terminator to get the correct spill code placement during
2104 // register allocation.
2105 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2106 break;
2107
2108 case AMDGPU::S_ANDN2_B32_term:
2109 // This is only a terminator to get the correct spill code placement during
2110 // register allocation.
2111 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2112 break;
2113
2114 case AMDGPU::S_AND_B64_term:
2115 // This is only a terminator to get the correct spill code placement during
2116 // register allocation.
2117 MI.setDesc(get(AMDGPU::S_AND_B64));
2118 break;
2119
2120 case AMDGPU::S_AND_B32_term:
2121 // This is only a terminator to get the correct spill code placement during
2122 // register allocation.
2123 MI.setDesc(get(AMDGPU::S_AND_B32));
2124 break;
2125
2126 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2127 // This is only a terminator to get the correct spill code placement during
2128 // register allocation.
2129 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2130 break;
2131
2132 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2133 // This is only a terminator to get the correct spill code placement during
2134 // register allocation.
2135 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2136 break;
2137
2138 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2139 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2140 break;
2141
2142 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2143 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2144 break;
2145 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2146 Register Dst = MI.getOperand(0).getReg();
2147 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2148 MI.setDesc(
2149 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2150 break;
2151 }
2152 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2153 Register Dst = MI.getOperand(0).getReg();
2154 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2155 int64_t Imm = MI.getOperand(1).getImm();
2156
2157 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2158 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2159 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2162 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2163 .addImm(SignExtend64<32>(Imm >> 32))
2165 MI.eraseFromParent();
2166 break;
2167 }
2168
2169 [[fallthrough]];
2170 }
2171 case AMDGPU::V_MOV_B64_PSEUDO: {
2172 Register Dst = MI.getOperand(0).getReg();
2173 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2174 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2175
2176 const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
2177 const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0);
2178
2179 const MachineOperand &SrcOp = MI.getOperand(1);
2180 // FIXME: Will this work for 64-bit floating point immediates?
2181 assert(!SrcOp.isFPImm());
2182 if (ST.hasMovB64() && Mov64RC->contains(Dst)) {
2183 MI.setDesc(Mov64Desc);
2184 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2185 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2186 break;
2187 }
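// When the pseudo cannot be kept as a single 64-bit move, the immediate is
// split below. For example (illustrative): 0xAAAAAAAA55555555 becomes a
// v_mov_b32 of 0x55555555 into sub0 and a v_mov_b32 of 0xAAAAAAAA into sub1,
// while a value whose two 32-bit halves are the same inline constant (such as
// all ones) can use a single v_pk_mov_b32 if the subtarget supports it.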
2188 if (SrcOp.isImm()) {
2189 APInt Imm(64, SrcOp.getImm());
2190 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2191 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2192 const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
2193 const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0);
2194
2195 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
2196 PkMovRC->contains(Dst)) {
2197 BuildMI(MBB, MI, DL, PkMovDesc, Dst)
2199 .addImm(Lo.getSExtValue())
2201 .addImm(Lo.getSExtValue())
2202 .addImm(0) // op_sel_lo
2203 .addImm(0) // op_sel_hi
2204 .addImm(0) // neg_lo
2205 .addImm(0) // neg_hi
2206 .addImm(0); // clamp
2207 } else {
2208 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2209 .addImm(Lo.getSExtValue())
2211 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2212 .addImm(Hi.getSExtValue())
2214 }
2215 } else {
2216 assert(SrcOp.isReg());
2217 if (ST.hasPkMovB32() &&
2218 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2219 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2220 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2221 .addReg(SrcOp.getReg())
2223 .addReg(SrcOp.getReg())
2224 .addImm(0) // op_sel_lo
2225 .addImm(0) // op_sel_hi
2226 .addImm(0) // neg_lo
2227 .addImm(0) // neg_hi
2228 .addImm(0); // clamp
2229 } else {
2230 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2231 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2233 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2234 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2236 }
2237 }
2238 MI.eraseFromParent();
2239 break;
2240 }
2241 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2243 break;
2244 }
2245 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2246 const MachineOperand &SrcOp = MI.getOperand(1);
2247 assert(!SrcOp.isFPImm());
2248
2249 if (ST.has64BitLiterals()) {
2250 MI.setDesc(get(AMDGPU::S_MOV_B64));
2251 break;
2252 }
2253
2254 APInt Imm(64, SrcOp.getImm());
2255 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2256 MI.setDesc(get(AMDGPU::S_MOV_B64));
2257 break;
2258 }
2259
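// For example (illustrative): 0x1234567800000000 is neither a signed 32-bit
// value nor an inline constant, so without 64-bit literal support it is split
// below into "s_mov_b32 lo, 0" and "s_mov_b32 hi, 0x12345678".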
2260 Register Dst = MI.getOperand(0).getReg();
2261 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2262 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2263
2264 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2265 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2266 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2267 .addImm(Lo.getSExtValue())
2269 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2270 .addImm(Hi.getSExtValue())
2272 MI.eraseFromParent();
2273 break;
2274 }
2275 case AMDGPU::V_SET_INACTIVE_B32: {
2276 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2277 Register DstReg = MI.getOperand(0).getReg();
2278 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2279 .add(MI.getOperand(3))
2280 .add(MI.getOperand(4))
2281 .add(MI.getOperand(1))
2282 .add(MI.getOperand(2))
2283 .add(MI.getOperand(5));
2284 MI.eraseFromParent();
2285 break;
2286 }
2287 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2288 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2289 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2290 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2291 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2292 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2293 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2294 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2295 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2296 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2297 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2298 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2299 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2300 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2301 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2302 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2303 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2304 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2305 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2306 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2307 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2308 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2309 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2310 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2311 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2312 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2313 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2314 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2315 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2316 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2317 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2318 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2319 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2320 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2321
2322 unsigned Opc;
2323 if (RI.hasVGPRs(EltRC)) {
2324 Opc = AMDGPU::V_MOVRELD_B32_e32;
2325 } else {
2326 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2327 : AMDGPU::S_MOVRELD_B32;
2328 }
2329
2330 const MCInstrDesc &OpDesc = get(Opc);
2331 Register VecReg = MI.getOperand(0).getReg();
2332 bool IsUndef = MI.getOperand(1).isUndef();
2333 unsigned SubReg = MI.getOperand(3).getImm();
2334 assert(VecReg == MI.getOperand(1).getReg());
2335
2337 BuildMI(MBB, MI, DL, OpDesc)
2338 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2339 .add(MI.getOperand(2))
2341 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2342
2343 const int ImpDefIdx =
2344 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2345 const int ImpUseIdx = ImpDefIdx + 1;
2346 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2347 MI.eraseFromParent();
2348 break;
2349 }
2350 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2351 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2352 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2353 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2354 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2355 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2356 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2357 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2358 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2359 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2360 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2361 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2362 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2363 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2364 assert(ST.useVGPRIndexMode());
2365 Register VecReg = MI.getOperand(0).getReg();
2366 bool IsUndef = MI.getOperand(1).isUndef();
2367 MachineOperand &Idx = MI.getOperand(3);
2368 Register SubReg = MI.getOperand(4).getImm();
2369
2370 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2371 .add(Idx)
2373 SetOn->getOperand(3).setIsUndef();
2374
2375 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2377 BuildMI(MBB, MI, DL, OpDesc)
2378 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2379 .add(MI.getOperand(2))
2381 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2382
2383 const int ImpDefIdx =
2384 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2385 const int ImpUseIdx = ImpDefIdx + 1;
2386 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2387
2388 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2389
2390 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2391
2392 MI.eraseFromParent();
2393 break;
2394 }
2395 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2396 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2397 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2398 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2399 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2400 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2401 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2402 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2403 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2404 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2405 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2406 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2407 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2408 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2409 assert(ST.useVGPRIndexMode());
2410 Register Dst = MI.getOperand(0).getReg();
2411 Register VecReg = MI.getOperand(1).getReg();
2412 bool IsUndef = MI.getOperand(1).isUndef();
2413 Register Idx = MI.getOperand(2).getReg();
2414 Register SubReg = MI.getOperand(3).getImm();
2415
2416 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2417 .addReg(Idx)
2419 SetOn->getOperand(3).setIsUndef();
2420
2421 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2422 .addDef(Dst)
2423 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2424 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2425
2426 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2427
2428 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2429
2430 MI.eraseFromParent();
2431 break;
2432 }
2433 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2434 MachineFunction &MF = *MBB.getParent();
2435 Register Reg = MI.getOperand(0).getReg();
2436 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2437 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2438 MachineOperand OpLo = MI.getOperand(1);
2439 MachineOperand OpHi = MI.getOperand(2);
2440
2441 // Create a bundle so these instructions won't be re-ordered by the
2442 // post-RA scheduler.
2443 MIBundleBuilder Bundler(MBB, MI);
2444 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2445
2446 // What we want here is an offset from the value returned by s_getpc (which
2447 // is the address of the s_add_u32 instruction) to the global variable, but
2448 // since the encoding of $symbol starts 4 bytes after the start of the
2449 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2450 // small. This requires us to add 4 to the global variable offset in order
2451 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2452 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2453 // instruction.
2454
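// For example (illustrative), for a global @sym the bundle built below is
// roughly:
//   s_getpc_b64 s[N:N+1]
//   s_add_u32   sN,   sN,   sym@rel32@lo+4
//   s_addc_u32  sN+1, sN+1, sym@rel32@hi+12
// (each offset grows by another 4 if the s_sext_i32_i16 fixup below is
// inserted), where the +4/+12 account for the distance from the s_getpc_b64
// result to the start of each literal's encoding.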
2455 int64_t Adjust = 0;
2456 if (ST.hasGetPCZeroExtension()) {
2457 // Fix up hardware that does not sign-extend the 48-bit PC value by
2458 // inserting: s_sext_i32_i16 reghi, reghi
2459 Bundler.append(
2460 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2461 Adjust += 4;
2462 }
2463
2464 if (OpLo.isGlobal())
2465 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2466 Bundler.append(
2467 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2468
2469 if (OpHi.isGlobal())
2470 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2471 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2472 .addReg(RegHi)
2473 .add(OpHi));
2474
2475 finalizeBundle(MBB, Bundler.begin());
2476
2477 MI.eraseFromParent();
2478 break;
2479 }
2480 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2481 MachineFunction &MF = *MBB.getParent();
2482 Register Reg = MI.getOperand(0).getReg();
2483 MachineOperand Op = MI.getOperand(1);
2484
2485 // Create a bundle so these instructions won't be re-ordered by the
2486 // post-RA scheduler.
2487 MIBundleBuilder Bundler(MBB, MI);
2488 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2489 if (Op.isGlobal())
2490 Op.setOffset(Op.getOffset() + 4);
2491 Bundler.append(
2492 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2493
2494 finalizeBundle(MBB, Bundler.begin());
2495
2496 MI.eraseFromParent();
2497 break;
2498 }
2499 case AMDGPU::ENTER_STRICT_WWM: {
2500 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2501 // Whole Wave Mode is entered.
2502 MI.setDesc(get(LMC.OrSaveExecOpc));
2503 break;
2504 }
2505 case AMDGPU::ENTER_STRICT_WQM: {
2506 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2507 // STRICT_WQM is entered.
2508 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2509 .addReg(LMC.ExecReg);
2510 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2511
2512 MI.eraseFromParent();
2513 break;
2514 }
2515 case AMDGPU::EXIT_STRICT_WWM:
2516 case AMDGPU::EXIT_STRICT_WQM: {
2517 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2518 // WWM/STRICT_WQM is exited.
2519 MI.setDesc(get(LMC.MovOpc));
2520 break;
2521 }
2522 case AMDGPU::SI_RETURN: {
2523 const MachineFunction *MF = MBB.getParent();
2524 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2525 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2526 // Hiding the return address use with SI_RETURN may lead to extra kills in
2527 // the function and missing live-ins. We are fine in practice because callee
2528 // saved register handling ensures the register value is restored before
2529 // RET, but we need the undef flag here to appease the MachineVerifier
2530 // liveness checks.
2532 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2533 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2534
2535 MIB.copyImplicitOps(MI);
2536 MI.eraseFromParent();
2537 break;
2538 }
2539
2540 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2541 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2542 MI.setDesc(get(AMDGPU::S_MUL_U64));
2543 break;
2544
2545 case AMDGPU::S_GETPC_B64_pseudo:
2546 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2547 if (ST.hasGetPCZeroExtension()) {
2548 Register Dst = MI.getOperand(0).getReg();
2549 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2550 // Fix up hardware that does not sign-extend the 48-bit PC value by
2551 // inserting: s_sext_i32_i16 dsthi, dsthi
2552 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2553 DstHi)
2554 .addReg(DstHi);
2555 }
2556 break;
2557
2558 case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
2559 assert(ST.hasBF16PackedInsts());
2560 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2561 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2562 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2563 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2564 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2565 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2566 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2567 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2568 break;
2569 }
2570 }
2571
2572 return true;
2573}
2574
2577 unsigned SubIdx,
2578 const MachineInstr &Orig) const {
2579
2580 // Try shrinking the instruction to remat only the part needed for the
2581 // current context.
2582 // TODO: Handle more cases.
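// For instance (illustrative): if the only user of an S_LOAD_DWORDX16_IMM
// result reads its sub4_sub5_sub6_sub7 piece, the code below rematerializes
// just that piece as an S_LOAD_DWORDX4_IMM, with the immediate offset bumped
// by 16 bytes and the memory operand narrowed to 16 bytes.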
2583 unsigned Opcode = Orig.getOpcode();
2584 switch (Opcode) {
2585 case AMDGPU::S_LOAD_DWORDX16_IMM:
2586 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2587 if (SubIdx != 0)
2588 break;
2589
2590 if (I == MBB.end())
2591 break;
2592
2593 if (I->isBundled())
2594 break;
2595
2596 // Look for a single use of the register that is also a subreg.
2597 Register RegToFind = Orig.getOperand(0).getReg();
2598 MachineOperand *UseMO = nullptr;
2599 for (auto &CandMO : I->operands()) {
2600 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2601 continue;
2602 if (UseMO) {
2603 UseMO = nullptr;
2604 break;
2605 }
2606 UseMO = &CandMO;
2607 }
2608 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2609 break;
2610
2611 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2612 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2613
2614 MachineFunction *MF = MBB.getParent();
2616 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2617
2618 unsigned NewOpcode = -1;
2619 if (SubregSize == 256)
2620 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2621 else if (SubregSize == 128)
2622 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2623 else
2624 break;
2625
2626 const MCInstrDesc &TID = get(NewOpcode);
2627 const TargetRegisterClass *NewRC =
2628 RI.getAllocatableClass(getRegClass(TID, 0));
2629 MRI.setRegClass(DestReg, NewRC);
2630
2631 UseMO->setReg(DestReg);
2632 UseMO->setSubReg(AMDGPU::NoSubRegister);
2633
2634 // Use a smaller load with the desired size, possibly with updated offset.
2635 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2636 MI->setDesc(TID);
2637 MI->getOperand(0).setReg(DestReg);
2638 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2639 if (Offset) {
2640 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2641 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2642 OffsetMO->setImm(FinalOffset);
2643 }
2645 for (const MachineMemOperand *MemOp : Orig.memoperands())
2646 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2647 SubregSize / 8));
2648 MI->setMemRefs(*MF, NewMMOs);
2649
2650 MBB.insert(I, MI);
2651 return;
2652 }
2653
2654 default:
2655 break;
2656 }
2657
2658 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig);
2659}
2660
2661std::pair<MachineInstr*, MachineInstr*>
2663 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2664
2665 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2667 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2668 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2669 return std::pair(&MI, nullptr);
2670 }
2671
2672 MachineBasicBlock &MBB = *MI.getParent();
2673 DebugLoc DL = MBB.findDebugLoc(MI);
2674 MachineFunction *MF = MBB.getParent();
2676 Register Dst = MI.getOperand(0).getReg();
2677 unsigned Part = 0;
2678 MachineInstr *Split[2];
2679
2680 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2681 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2682 if (Dst.isPhysical()) {
2683 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2684 } else {
2685 assert(MRI.isSSA());
2686 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2687 MovDPP.addDef(Tmp);
2688 }
2689
2690 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2691 const MachineOperand &SrcOp = MI.getOperand(I);
2692 assert(!SrcOp.isFPImm());
2693 if (SrcOp.isImm()) {
2694 APInt Imm(64, SrcOp.getImm());
2695 Imm.ashrInPlace(Part * 32);
2696 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2697 } else {
2698 assert(SrcOp.isReg());
2699 Register Src = SrcOp.getReg();
2700 if (Src.isPhysical())
2701 MovDPP.addReg(RI.getSubReg(Src, Sub));
2702 else
2703 MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub);
2704 }
2705 }
2706
2707 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2708 MovDPP.addImm(MO.getImm());
2709
2710 Split[Part] = MovDPP;
2711 ++Part;
2712 }
2713
2714 if (Dst.isVirtual())
2715 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2716 .addReg(Split[0]->getOperand(0).getReg())
2717 .addImm(AMDGPU::sub0)
2718 .addReg(Split[1]->getOperand(0).getReg())
2719 .addImm(AMDGPU::sub1);
2720
2721 MI.eraseFromParent();
2722 return std::pair(Split[0], Split[1]);
2723}
2724
2725std::optional<DestSourcePair>
2727 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2728 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2729
2730 return std::nullopt;
2731}
2732
2734 AMDGPU::OpName Src0OpName,
2735 MachineOperand &Src1,
2736 AMDGPU::OpName Src1OpName) const {
2737 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2738 if (!Src0Mods)
2739 return false;
2740
2741 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2742 assert(Src1Mods &&
2743 "All commutable instructions have both src0 and src1 modifiers");
2744
2745 int Src0ModsVal = Src0Mods->getImm();
2746 int Src1ModsVal = Src1Mods->getImm();
2747
2748 Src1Mods->setImm(Src0ModsVal);
2749 Src0Mods->setImm(Src1ModsVal);
2750 return true;
2751}
2752
2754 MachineOperand &RegOp,
2755 MachineOperand &NonRegOp) {
2756 Register Reg = RegOp.getReg();
2757 unsigned SubReg = RegOp.getSubReg();
2758 bool IsKill = RegOp.isKill();
2759 bool IsDead = RegOp.isDead();
2760 bool IsUndef = RegOp.isUndef();
2761 bool IsDebug = RegOp.isDebug();
2762
2763 if (NonRegOp.isImm())
2764 RegOp.ChangeToImmediate(NonRegOp.getImm());
2765 else if (NonRegOp.isFI())
2766 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2767 else if (NonRegOp.isGlobal()) {
2768 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2769 NonRegOp.getTargetFlags());
2770 } else
2771 return nullptr;
2772
2773 // Make sure we don't reinterpret a subreg index in the target flags.
2774 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2775
2776 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2777 NonRegOp.setSubReg(SubReg);
2778
2779 return &MI;
2780}
2781
2783 MachineOperand &NonRegOp1,
2784 MachineOperand &NonRegOp2) {
2785 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2786 int64_t NonRegVal = NonRegOp1.getImm();
2787
2788 NonRegOp1.setImm(NonRegOp2.getImm());
2789 NonRegOp2.setImm(NonRegVal);
2790 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2791 NonRegOp2.setTargetFlags(TargetFlags);
2792 return &MI;
2793}
2794
2795bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2796 unsigned OpIdx1) const {
2797 const MCInstrDesc &InstDesc = MI.getDesc();
2798 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2799 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2800
2801 unsigned Opc = MI.getOpcode();
2802 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2803
2804 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2805 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2806
2807 // Check that the swap doesn't breach the constant bus or literal limits.
2808 // It may move a literal to a position other than src0, which is not allowed
2809 // pre-gfx10. However, most test cases need literals in Src0 for VOP.
2810 // FIXME: After gfx9, a literal can be placed somewhere other than Src0.
2811 if (isVALU(MI)) {
2812 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2813 !isInlineConstant(MO0, OpInfo1))
2814 return false;
2815 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2816 !isInlineConstant(MO1, OpInfo0))
2817 return false;
2818 }
2819
2820 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2821 if (OpInfo1.RegClass == -1)
2822 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2823 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2824 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2825 }
2826 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2827 if (OpInfo0.RegClass == -1)
2828 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2829 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2830 isLegalRegOperand(MI, OpIdx0, MO1);
2831 }
2832
2833 // No need to check 64-bit literals, since swapping does not bring new
2834 // 64-bit literals into the current instruction to fold to 32-bit.
2835
2836 return isImmOperandLegal(MI, OpIdx1, MO0);
2837}
2838
2840 unsigned Src0Idx,
2841 unsigned Src1Idx) const {
2842 assert(!NewMI && "this should never be used");
2843
2844 unsigned Opc = MI.getOpcode();
2845 int CommutedOpcode = commuteOpcode(Opc);
2846 if (CommutedOpcode == -1)
2847 return nullptr;
2848
2849 if (Src0Idx > Src1Idx)
2850 std::swap(Src0Idx, Src1Idx);
2851
2852 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2853 static_cast<int>(Src0Idx) &&
2854 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2855 static_cast<int>(Src1Idx) &&
2856 "inconsistency with findCommutedOpIndices");
2857
2858 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2859 return nullptr;
2860
2861 MachineInstr *CommutedMI = nullptr;
2862 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2863 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2864 if (Src0.isReg() && Src1.isReg()) {
2865 // Be sure to copy the source modifiers to the right place.
2866 CommutedMI =
2867 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2868 } else if (Src0.isReg() && !Src1.isReg()) {
2869 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2870 } else if (!Src0.isReg() && Src1.isReg()) {
2871 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2872 } else if (Src0.isImm() && Src1.isImm()) {
2873 CommutedMI = swapImmOperands(MI, Src0, Src1);
2874 } else {
2875 // FIXME: Found two non-register operands to commute. This does happen.
2876 return nullptr;
2877 }
2878
2879 if (CommutedMI) {
2880 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2881 Src1, AMDGPU::OpName::src1_modifiers);
2882
2883 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2884 AMDGPU::OpName::src1_sel);
2885
2886 CommutedMI->setDesc(get(CommutedOpcode));
2887 }
2888
2889 return CommutedMI;
2890}
2891
2892// This needs to be implemented because the source modifiers may be inserted
2893// between the true commutable operands, and the base
2894// TargetInstrInfo::commuteInstruction uses it.
2896 unsigned &SrcOpIdx0,
2897 unsigned &SrcOpIdx1) const {
2898 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2899}
2900
2902 unsigned &SrcOpIdx0,
2903 unsigned &SrcOpIdx1) const {
2904 if (!Desc.isCommutable())
2905 return false;
2906
2907 unsigned Opc = Desc.getOpcode();
2908 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2909 if (Src0Idx == -1)
2910 return false;
2911
2912 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2913 if (Src1Idx == -1)
2914 return false;
2915
2916 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2917}
2918
2920 int64_t BrOffset) const {
2921 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2922 // because its dest block is unanalyzable.
2923 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2924
2925 // Convert to dwords.
2926 BrOffset /= 4;
2927
2928 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2929 // from the next instruction.
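// Worked example (assuming the default 16 bits of branch offset): a forward
// byte offset of 131068 becomes 131068 / 4 - 1 = 32766, which fits in a
// signed 16-bit immediate, while 131076 becomes 32768, which does not.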
2930 BrOffset -= 1;
2931
2932 return isIntN(BranchOffsetBits, BrOffset);
2933}
2934
2937 return MI.getOperand(0).getMBB();
2938}
2939
2941 for (const MachineInstr &MI : MBB->terminators()) {
2942 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2943 MI.getOpcode() == AMDGPU::SI_LOOP)
2944 return true;
2945 }
2946 return false;
2947}
2948
2950 MachineBasicBlock &DestBB,
2951 MachineBasicBlock &RestoreBB,
2952 const DebugLoc &DL, int64_t BrOffset,
2953 RegScavenger *RS) const {
2954 assert(MBB.empty() &&
2955 "new block should be inserted for expanding unconditional branch");
2956 assert(MBB.pred_size() == 1);
2957 assert(RestoreBB.empty() &&
2958 "restore block should be inserted for restoring clobbered registers");
2959
2960 MachineFunction *MF = MBB.getParent();
2963 auto I = MBB.end();
2964 auto &MCCtx = MF->getContext();
2965
2966 if (ST.useAddPC64Inst()) {
2967 MCSymbol *Offset =
2968 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2969 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2971 MCSymbol *PostAddPCLabel =
2972 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2973 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2974 auto *OffsetExpr = MCBinaryExpr::createSub(
2975 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2976 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2977 Offset->setVariableValue(OffsetExpr);
2978 return;
2979 }
2980
2981 assert(RS && "RegScavenger required for long branching");
2982
2983 // FIXME: Virtual register workaround for RegScavenger not working with empty
2984 // blocks.
2985 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2986
2987 // Note: as this is used after the hazard recognizer, we need to apply some
2988 // hazard workarounds directly.
2989 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2990 ST.hasVALUReadSGPRHazard();
2991 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2992 if (FlushSGPRWrites)
2993 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2995 };
2996
2997 // We need to compute the offset relative to the instruction immediately after
2998 // s_getpc_b64. Insert the pc arithmetic code before the last terminator.
2999 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
3000 ApplyHazardWorkarounds();
3001
3002 MCSymbol *PostGetPCLabel =
3003 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
3004 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
3005
3006 MCSymbol *OffsetLo =
3007 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
3008 MCSymbol *OffsetHi =
3009 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
3010 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
3011 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
3012 .addReg(PCReg, {}, AMDGPU::sub0)
3013 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
3014 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
3015 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
3016 .addReg(PCReg, {}, AMDGPU::sub1)
3017 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
3018 ApplyHazardWorkarounds();
3019
3020 // Insert the indirect branch after the other terminator.
3021 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
3022 .addReg(PCReg);
3023
3024 // If a spill is needed for the pc register pair, we need to insert a spill
3025 // restore block right before the destination block, and insert a short branch
3026 // into the old destination block's fallthrough predecessor.
3027 // e.g.:
3028 //
3029 // s_cbranch_scc0 skip_long_branch:
3030 //
3031 // long_branch_bb:
3032 // spill s[8:9]
3033 // s_getpc_b64 s[8:9]
3034 // s_add_u32 s8, s8, restore_bb
3035 // s_addc_u32 s9, s9, 0
3036 // s_setpc_b64 s[8:9]
3037 //
3038 // skip_long_branch:
3039 // foo;
3040 //
3041 // .....
3042 //
3043 // dest_bb_fallthrough_predecessor:
3044 // bar;
3045 // s_branch dest_bb
3046 //
3047 // restore_bb:
3048 // restore s[8:9]
3049 // fallthrough dest_bb
3050 //
3051 // dest_bb:
3052 // buzz;
3053
3054 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3055 Register Scav;
3056
3057 // If we've previously reserved a register for long branches,
3058 // avoid running the scavenger and just use those registers.
3059 if (LongBranchReservedReg) {
3060 RS->enterBasicBlock(MBB);
3061 Scav = LongBranchReservedReg;
3062 } else {
3063 RS->enterBasicBlockEnd(MBB);
3064 Scav = RS->scavengeRegisterBackwards(
3065 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3066 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3067 }
3068 if (Scav) {
3069 RS->setRegUsed(Scav);
3070 MRI.replaceRegWith(PCReg, Scav);
3071 MRI.clearVirtRegs();
3072 } else {
3073 // As spilling an SGPR requires a VGPR, we reuse the slot of the temporary
3074 // VGPR for the SGPR spill.
3075 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3076 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3077 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3078 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3079 MRI.clearVirtRegs();
3080 }
3081
3082 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3083 // Now the distance can be defined.
3085 MCSymbolRefExpr::create(DestLabel, MCCtx),
3086 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3087 // Add offset assignments.
3088 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3089 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3090 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3091 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3092}
3093
3094unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3095 switch (Cond) {
3096 case SIInstrInfo::SCC_TRUE:
3097 return AMDGPU::S_CBRANCH_SCC1;
3098 case SIInstrInfo::SCC_FALSE:
3099 return AMDGPU::S_CBRANCH_SCC0;
3100 case SIInstrInfo::VCCNZ:
3101 return AMDGPU::S_CBRANCH_VCCNZ;
3102 case SIInstrInfo::VCCZ:
3103 return AMDGPU::S_CBRANCH_VCCZ;
3104 case SIInstrInfo::EXECNZ:
3105 return AMDGPU::S_CBRANCH_EXECNZ;
3106 case SIInstrInfo::EXECZ:
3107 return AMDGPU::S_CBRANCH_EXECZ;
3108 default:
3109 llvm_unreachable("invalid branch predicate");
3110 }
3111}
3112
3113SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3114 switch (Opcode) {
3115 case AMDGPU::S_CBRANCH_SCC0:
3116 return SCC_FALSE;
3117 case AMDGPU::S_CBRANCH_SCC1:
3118 return SCC_TRUE;
3119 case AMDGPU::S_CBRANCH_VCCNZ:
3120 return VCCNZ;
3121 case AMDGPU::S_CBRANCH_VCCZ:
3122 return VCCZ;
3123 case AMDGPU::S_CBRANCH_EXECNZ:
3124 return EXECNZ;
3125 case AMDGPU::S_CBRANCH_EXECZ:
3126 return EXECZ;
3127 default:
3128 return INVALID_BR;
3129 }
3130}
3131
3135 MachineBasicBlock *&FBB,
3137 bool AllowModify) const {
3138 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3139 // Unconditional Branch
3140 TBB = I->getOperand(0).getMBB();
3141 return false;
3142 }
3143
3144 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3145 if (Pred == INVALID_BR)
3146 return true;
3147
3148 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3149 Cond.push_back(MachineOperand::CreateImm(Pred));
3150 Cond.push_back(I->getOperand(1)); // Save the branch register.
3151
3152 ++I;
3153
3154 if (I == MBB.end()) {
3155 // Conditional branch followed by fall-through.
3156 TBB = CondBB;
3157 return false;
3158 }
3159
3160 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3161 TBB = CondBB;
3162 FBB = I->getOperand(0).getMBB();
3163 return false;
3164 }
3165
3166 return true;
3167}
3168
3170 MachineBasicBlock *&FBB,
3172 bool AllowModify) const {
3173 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3174 auto E = MBB.end();
3175 if (I == E)
3176 return false;
3177
3178 // Skip over the instructions that are artificial terminators for special
3179 // exec management.
3180 while (I != E && !I->isBranch() && !I->isReturn()) {
3181 switch (I->getOpcode()) {
3182 case AMDGPU::S_MOV_B64_term:
3183 case AMDGPU::S_XOR_B64_term:
3184 case AMDGPU::S_OR_B64_term:
3185 case AMDGPU::S_ANDN2_B64_term:
3186 case AMDGPU::S_AND_B64_term:
3187 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3188 case AMDGPU::S_MOV_B32_term:
3189 case AMDGPU::S_XOR_B32_term:
3190 case AMDGPU::S_OR_B32_term:
3191 case AMDGPU::S_ANDN2_B32_term:
3192 case AMDGPU::S_AND_B32_term:
3193 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3194 break;
3195 case AMDGPU::SI_IF:
3196 case AMDGPU::SI_ELSE:
3197 case AMDGPU::SI_KILL_I1_TERMINATOR:
3198 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3199 // FIXME: It's messy that these need to be considered here at all.
3200 return true;
3201 default:
3202 llvm_unreachable("unexpected non-branch terminator inst");
3203 }
3204
3205 ++I;
3206 }
3207
3208 if (I == E)
3209 return false;
3210
3211 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3212}
3213
3215 int *BytesRemoved) const {
3216 unsigned Count = 0;
3217 unsigned RemovedSize = 0;
3218 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3219 // Skip over artificial terminators when removing instructions.
3220 if (MI.isBranch() || MI.isReturn()) {
3221 RemovedSize += getInstSizeInBytes(MI);
3222 MI.eraseFromParent();
3223 ++Count;
3224 }
3225 }
3226
3227 if (BytesRemoved)
3228 *BytesRemoved = RemovedSize;
3229
3230 return Count;
3231}
3232
3233// Copy the flags onto the implicit condition register operand.
3235 const MachineOperand &OrigCond) {
3236 CondReg.setIsUndef(OrigCond.isUndef());
3237 CondReg.setIsKill(OrigCond.isKill());
3238}
3239
3242 MachineBasicBlock *FBB,
3244 const DebugLoc &DL,
3245 int *BytesAdded) const {
3246 if (!FBB && Cond.empty()) {
3247 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3248 .addMBB(TBB);
3249 if (BytesAdded)
3250 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3251 return 1;
3252 }
3253
3254 assert(TBB && Cond[0].isImm());
3255
3256 unsigned Opcode
3257 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3258
3259 if (!FBB) {
3260 MachineInstr *CondBr =
3261 BuildMI(&MBB, DL, get(Opcode))
3262 .addMBB(TBB);
3263
3264 // Copy the flags onto the implicit condition register operand.
3265 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3266 fixImplicitOperands(*CondBr);
3267
3268 if (BytesAdded)
3269 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3270 return 1;
3271 }
3272
3273 assert(TBB && FBB);
3274
3275 MachineInstr *CondBr =
3276 BuildMI(&MBB, DL, get(Opcode))
3277 .addMBB(TBB);
3278 fixImplicitOperands(*CondBr);
3279 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3280 .addMBB(FBB);
3281
3282 MachineOperand &CondReg = CondBr->getOperand(1);
3283 CondReg.setIsUndef(Cond[1].isUndef());
3284 CondReg.setIsKill(Cond[1].isKill());
3285
3286 if (BytesAdded)
3287 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3288
3289 return 2;
3290}
3291
3294 if (Cond.size() != 2) {
3295 return true;
3296 }
3297
3298 if (Cond[0].isImm()) {
3299 Cond[0].setImm(-Cond[0].getImm());
3300 return false;
3301 }
3302
3303 return true;
3304}
3305
3308 Register DstReg, Register TrueReg,
3309 Register FalseReg, int &CondCycles,
3310 int &TrueCycles, int &FalseCycles) const {
3311 switch (Cond[0].getImm()) {
3312 case VCCNZ:
3313 case VCCZ: {
3314 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3315 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3316 if (MRI.getRegClass(FalseReg) != RC)
3317 return false;
3318
3319 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3320 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3321
3322 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3323 return RI.hasVGPRs(RC) && NumInsts <= 6;
3324 }
3325 case SCC_TRUE:
3326 case SCC_FALSE: {
3327 // FIXME: We could insert for VGPRs if we could replace the original compare
3328 // with a vector one.
3329 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3330 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3331 if (MRI.getRegClass(FalseReg) != RC)
3332 return false;
3333
3334 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3335
3336 // Multiples of 8 bytes can use s_cselect_b64.
3337 if (NumInsts % 2 == 0)
3338 NumInsts /= 2;
3339
3340 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3341 return RI.isSGPRClass(RC);
3342 }
3343 default:
3344 return false;
3345 }
3346}
3347
3351 Register TrueReg, Register FalseReg) const {
3352 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3353 if (Pred == VCCZ || Pred == SCC_FALSE) {
3354 Pred = static_cast<BranchPredicate>(-Pred);
3355 std::swap(TrueReg, FalseReg);
3356 }
3357
3358 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3359 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3360 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3361
3362 if (DstSize == 32) {
3364 if (Pred == SCC_TRUE) {
3365 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3366 .addReg(TrueReg)
3367 .addReg(FalseReg);
3368 } else {
3369 // Instruction's operands are backwards from what is expected.
3370 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3371 .addReg(FalseReg)
3372 .addReg(TrueReg);
3373 }
3374
3375 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3376 return;
3377 }
3378
3379 if (DstSize == 64 && Pred == SCC_TRUE) {
3381 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3382 .addReg(TrueReg)
3383 .addReg(FalseReg);
3384
3385 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3386 return;
3387 }
3388
3389 static const int16_t Sub0_15[] = {
3390 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3391 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3392 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3393 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3394 };
3395
3396 static const int16_t Sub0_15_64[] = {
3397 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3398 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3399 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3400 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3401 };
3402
3403 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3404 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3405 const int16_t *SubIndices = Sub0_15;
3406 int NElts = DstSize / 32;
3407
3408 // 64-bit select is only available for SALU.
3409 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
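// For example (illustrative): a 128-bit SCC_TRUE select is emitted as two
// S_CSELECT_B64 operating on the sub0_sub1 and sub2_sub3 halves, stitched
// back together with a REG_SEQUENCE, while the same width under VCCNZ uses
// four V_CNDMASK_B32_e32 instructions instead.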
3410 if (Pred == SCC_TRUE) {
3411 if (NElts % 2) {
3412 SelOp = AMDGPU::S_CSELECT_B32;
3413 EltRC = &AMDGPU::SGPR_32RegClass;
3414 } else {
3415 SelOp = AMDGPU::S_CSELECT_B64;
3416 EltRC = &AMDGPU::SGPR_64RegClass;
3417 SubIndices = Sub0_15_64;
3418 NElts /= 2;
3419 }
3420 }
3421
3423 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3424
3425 I = MIB->getIterator();
3426
3428 for (int Idx = 0; Idx != NElts; ++Idx) {
3429 Register DstElt = MRI.createVirtualRegister(EltRC);
3430 Regs.push_back(DstElt);
3431
3432 unsigned SubIdx = SubIndices[Idx];
3433
3435 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3436 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3437 .addReg(FalseReg, {}, SubIdx)
3438 .addReg(TrueReg, {}, SubIdx);
3439 } else {
3440 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3441 .addReg(TrueReg, {}, SubIdx)
3442 .addReg(FalseReg, {}, SubIdx);
3443 }
3444
3445 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3447
3448 MIB.addReg(DstElt)
3449 .addImm(SubIdx);
3450 }
3451}
3452
3454 switch (MI.getOpcode()) {
3455 case AMDGPU::V_MOV_B16_t16_e32:
3456 case AMDGPU::V_MOV_B16_t16_e64:
3457 case AMDGPU::V_MOV_B32_e32:
3458 case AMDGPU::V_MOV_B32_e64:
3459 case AMDGPU::V_MOV_B64_PSEUDO:
3460 case AMDGPU::V_MOV_B64_e32:
3461 case AMDGPU::V_MOV_B64_e64:
3462 case AMDGPU::S_MOV_B32:
3463 case AMDGPU::S_MOV_B64:
3464 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3465 case AMDGPU::COPY:
3466 case AMDGPU::WWM_COPY:
3467 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3468 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3469 case AMDGPU::V_ACCVGPR_MOV_B32:
3470 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3471 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3472 return true;
3473 default:
3474 return false;
3475 }
3476}
3477
3478 unsigned SIInstrInfo::getFoldableCopySrcIdx(const MachineInstr &MI) {
3479 switch (MI.getOpcode()) {
3480 case AMDGPU::V_MOV_B16_t16_e32:
3481 case AMDGPU::V_MOV_B16_t16_e64:
3482 return 2;
3483 case AMDGPU::V_MOV_B32_e32:
3484 case AMDGPU::V_MOV_B32_e64:
3485 case AMDGPU::V_MOV_B64_PSEUDO:
3486 case AMDGPU::V_MOV_B64_e32:
3487 case AMDGPU::V_MOV_B64_e64:
3488 case AMDGPU::S_MOV_B32:
3489 case AMDGPU::S_MOV_B64:
3490 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3491 case AMDGPU::COPY:
3492 case AMDGPU::WWM_COPY:
3493 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3494 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3495 case AMDGPU::V_ACCVGPR_MOV_B32:
3496 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3497 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3498 return 1;
3499 default:
3500 llvm_unreachable("MI is not a foldable copy");
3501 }
3502}
3503
3504static constexpr AMDGPU::OpName ModifierOpNames[] = {
3505 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3506 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3507 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3508
3509 void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3510 unsigned Opc = MI.getOpcode();
3511 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3512 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3513 if (Idx >= 0)
3514 MI.removeOperand(Idx);
3515 }
3516}
3517
3519 const MCInstrDesc &NewDesc) const {
3520 MI.setDesc(NewDesc);
3521
3522 // Remove any leftover implicit operands from mutating the instruction. e.g.
3523 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3524 // anymore.
3525 const MCInstrDesc &Desc = MI.getDesc();
3526 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3527 Desc.implicit_defs().size();
3528
3529 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3530 MI.removeOperand(I);
3531}
3532
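// Extract the value of \p Imm selected by subregister index \p SubRegIndex of
// a 64-bit immediate, sign-extended to 64 bits, or std::nullopt for indexes
// that are not handled. For example, sub1 of 0x1111222233334444 yields
// 0x0000000011112222.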
3533std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3534 unsigned SubRegIndex) {
3535 switch (SubRegIndex) {
3536 case AMDGPU::NoSubRegister:
3537 return Imm;
3538 case AMDGPU::sub0:
3539 return SignExtend64<32>(Imm);
3540 case AMDGPU::sub1:
3541 return SignExtend64<32>(Imm >> 32);
3542 case AMDGPU::lo16:
3543 return SignExtend64<16>(Imm);
3544 case AMDGPU::hi16:
3545 return SignExtend64<16>(Imm >> 16);
3546 case AMDGPU::sub1_lo16:
3547 return SignExtend64<16>(Imm >> 32);
3548 case AMDGPU::sub1_hi16:
3549 return SignExtend64<16>(Imm >> 48);
3550 default:
3551 return std::nullopt;
3552 }
3553
3554 llvm_unreachable("covered subregister switch");
3555}
3556
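// Map a MAC/MAD/FMA opcode to the corresponding *AK form, which takes the
// addend (src2) as a literal constant.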
3557static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3558 switch (Opc) {
3559 case AMDGPU::V_MAC_F16_e32:
3560 case AMDGPU::V_MAC_F16_e64:
3561 case AMDGPU::V_MAD_F16_e64:
3562 return AMDGPU::V_MADAK_F16;
3563 case AMDGPU::V_MAC_F32_e32:
3564 case AMDGPU::V_MAC_F32_e64:
3565 case AMDGPU::V_MAD_F32_e64:
3566 return AMDGPU::V_MADAK_F32;
3567 case AMDGPU::V_FMAC_F32_e32:
3568 case AMDGPU::V_FMAC_F32_e64:
3569 case AMDGPU::V_FMA_F32_e64:
3570 return AMDGPU::V_FMAAK_F32;
3571 case AMDGPU::V_FMAC_F16_e32:
3572 case AMDGPU::V_FMAC_F16_e64:
3573 case AMDGPU::V_FMAC_F16_t16_e64:
3574 case AMDGPU::V_FMAC_F16_fake16_e64:
3575 case AMDGPU::V_FMAC_F16_t16_e32:
3576 case AMDGPU::V_FMAC_F16_fake16_e32:
3577 case AMDGPU::V_FMA_F16_e64:
3578 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3579 ? AMDGPU::V_FMAAK_F16_t16
3580 : AMDGPU::V_FMAAK_F16_fake16
3581 : AMDGPU::V_FMAAK_F16;
3582 case AMDGPU::V_FMAC_F64_e32:
3583 case AMDGPU::V_FMAC_F64_e64:
3584 case AMDGPU::V_FMA_F64_e64:
3585 return AMDGPU::V_FMAAK_F64;
3586 default:
3587 llvm_unreachable("invalid instruction");
3588 }
3589}
3590
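// Map a MAC/MAD/FMA opcode to the corresponding *MK form, which takes one of
// the multiplicands as a literal constant.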
3591static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3592 switch (Opc) {
3593 case AMDGPU::V_MAC_F16_e32:
3594 case AMDGPU::V_MAC_F16_e64:
3595 case AMDGPU::V_MAD_F16_e64:
3596 return AMDGPU::V_MADMK_F16;
3597 case AMDGPU::V_MAC_F32_e32:
3598 case AMDGPU::V_MAC_F32_e64:
3599 case AMDGPU::V_MAD_F32_e64:
3600 return AMDGPU::V_MADMK_F32;
3601 case AMDGPU::V_FMAC_F32_e32:
3602 case AMDGPU::V_FMAC_F32_e64:
3603 case AMDGPU::V_FMA_F32_e64:
3604 return AMDGPU::V_FMAMK_F32;
3605 case AMDGPU::V_FMAC_F16_e32:
3606 case AMDGPU::V_FMAC_F16_e64:
3607 case AMDGPU::V_FMAC_F16_t16_e64:
3608 case AMDGPU::V_FMAC_F16_fake16_e64:
3609 case AMDGPU::V_FMAC_F16_t16_e32:
3610 case AMDGPU::V_FMAC_F16_fake16_e32:
3611 case AMDGPU::V_FMA_F16_e64:
3612 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3613 ? AMDGPU::V_FMAMK_F16_t16
3614 : AMDGPU::V_FMAMK_F16_fake16
3615 : AMDGPU::V_FMAMK_F16;
3616 case AMDGPU::V_FMAC_F64_e32:
3617 case AMDGPU::V_FMAC_F64_e64:
3618 case AMDGPU::V_FMA_F64_e64:
3619 return AMDGPU::V_FMAMK_F64;
3620 default:
3621 llvm_unreachable("invalid instruction");
3622 }
3623}
3624
3625 bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3626 Register Reg, MachineRegisterInfo *MRI) const {
3627 int64_t Imm;
3628 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3629 return false;
3630
3631 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3632
3633 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3634
3635 unsigned Opc = UseMI.getOpcode();
3636 if (Opc == AMDGPU::COPY) {
3637 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3638
3639 Register DstReg = UseMI.getOperand(0).getReg();
3640 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3641
3642 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3643
3644 if (HasMultipleUses) {
3645 // TODO: This should fold in more cases with multiple use, but we need to
3646 // more carefully consider what those uses are.
3647 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3648
3649 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3650 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3651 return false;
3652
3653 // Most of the time folding a 32-bit inline constant is free (though this
3654 // might not be true if we can't later fold it into a real user).
3655 //
3656 // FIXME: This isInlineConstant check is imprecise if
3657 // getConstValDefinedInReg handled the tricky non-mov cases.
3658 if (ImmDefSize == 32 &&
3659 !isInlineConstant(APInt(32, Imm, /*isSigned=*/true)))
3660 return false;
3661 }
3662
3663 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3664 RI.getSubRegIdxSize(UseSubReg) == 16;
3665
3666 if (Is16Bit) {
3667 if (RI.hasVGPRs(DstRC))
3668 return false; // Do not clobber vgpr_hi16
3669
3670 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3671 return false;
3672 }
3673
3674 MachineFunction *MF = UseMI.getMF();
3675
3676 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3677 MCRegister MovDstPhysReg =
3678 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3679
3680 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3681
3682 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3683 for (unsigned MovOp :
3684 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3685 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3686 const MCInstrDesc &MovDesc = get(MovOp);
3687
3688 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3689 if (Is16Bit) {
3690 // We just need to find a correctly sized register class, so the
3691 // subregister index compatibility doesn't matter since we're statically
3692 // extracting the immediate value.
3693 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3694 if (!MovDstRC)
3695 continue;
3696
3697 if (MovDstPhysReg) {
3698 // FIXME: We probably should not do this. If there is a live value in
3699 // the high half of the register, it will be corrupted.
3700 MovDstPhysReg =
3701 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3702 if (!MovDstPhysReg)
3703 continue;
3704 }
3705 }
3706
3707 // Result class isn't the right size, try the next instruction.
3708 if (MovDstPhysReg) {
3709 if (!MovDstRC->contains(MovDstPhysReg))
3710 return false;
3711 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3712 // TODO: This will be overly conservative in the case of 16-bit virtual
3713 // SGPRs. We could hack up the virtual register uses to use a compatible
3714 // 32-bit class.
3715 continue;
3716 }
3717
3718 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3719
3720 // Ensure the interpreted immediate value is a valid operand in the new
3721 // mov.
3722 //
3723 // FIXME: isImmOperandLegal should have form that doesn't require existing
3724 // MachineInstr or MachineOperand
3725 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3726 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3727 break;
3728
3729 NewOpc = MovOp;
3730 break;
3731 }
3732
3733 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3734 return false;
3735
3736 if (Is16Bit) {
3737 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3738 if (MovDstPhysReg)
3739 UseMI.getOperand(0).setReg(MovDstPhysReg);
3740 assert(UseMI.getOperand(1).getReg().isVirtual());
3741 }
3742
3743 const MCInstrDesc &NewMCID = get(NewOpc);
3744 UseMI.setDesc(NewMCID);
3745 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3746 UseMI.addImplicitDefUseOperands(*MF);
3747 return true;
3748 }
3749
3750 if (HasMultipleUses)
3751 return false;
3752
3753 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3754 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3755 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3756 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3757 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3758 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3759 Opc == AMDGPU::V_FMAC_F64_e64) {
3760 // Don't fold if we are using source or output modifiers. The new VOP2
3761 // instructions don't have them.
3762 if (hasAnyModifiersSet(UseMI))
3763 return false;
3764
3765 // If this is a free constant, there's no reason to do this.
3766 // TODO: We could fold this here instead of letting SIFoldOperands do it
3767 // later.
3768 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3769
3770 // Any src operand can be used for the legality check.
3771 if (isInlineConstant(UseMI, Src0Idx, Imm))
3772 return false;
3773
3774 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3775
3776 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3777 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3778
3779 auto CopyRegOperandToNarrowerRC =
3780 [MRI, this](MachineInstr &MI, unsigned OpNo,
3781 const TargetRegisterClass *NewRC) -> void {
3782 if (!MI.getOperand(OpNo).isReg())
3783 return;
3784 Register Reg = MI.getOperand(OpNo).getReg();
3785 const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg);
3786 if (RI.getCommonSubClass(RC, NewRC) != NewRC)
3787 return;
3788 Register Tmp = MRI->createVirtualRegister(NewRC);
3789 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
3790 get(AMDGPU::COPY), Tmp)
3791 .addReg(Reg);
3792 MI.getOperand(OpNo).setReg(Tmp);
3793 MI.getOperand(OpNo).setIsKill();
3794 };
3795
3796 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3797 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3798 (Src1->isReg() && Src1->getReg() == Reg)) {
3799 MachineOperand *RegSrc =
3800 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3801 if (!RegSrc->isReg())
3802 return false;
3803 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3804 ST.getConstantBusLimit(Opc) < 2)
3805 return false;
3806
3807 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3808 return false;
3809
3810 // If src2 is also a literal constant then we have to choose which one to
3811 // fold. In general it is better to choose madak so that the other literal
3812 // can be materialized in an sgpr instead of a vgpr:
3813 // s_mov_b32 s0, literal
3814 // v_madak_f32 v0, s0, v0, literal
3815 // Instead of:
3816 // v_mov_b32 v1, literal
3817 // v_madmk_f32 v0, v0, literal, v1
3818 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3819 if (Def && Def->isMoveImmediate() &&
3820 !isInlineConstant(Def->getOperand(1)))
3821 return false;
3822
3823 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3824 if (pseudoToMCOpcode(NewOpc) == -1)
3825 return false;
3826
3827 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3828 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3829
3830 // FIXME: This would be a lot easier if we could return a new instruction
3831 // instead of having to modify in place.
3832
3833 Register SrcReg = RegSrc->getReg();
3834 unsigned SrcSubReg = RegSrc->getSubReg();
3835 Src0->setReg(SrcReg);
3836 Src0->setSubReg(SrcSubReg);
3837 Src0->setIsKill(RegSrc->isKill());
3838
3839 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3840 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3841 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3842 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3843 UseMI.untieRegOperand(
3844 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3845
3846 Src1->ChangeToImmediate(*SubRegImm);
3847
3848 removeModOperands(UseMI);
3849 UseMI.setDesc(get(NewOpc));
3850
3851 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3852 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3853 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3854 Register Tmp = MRI->createVirtualRegister(NewRC);
3855 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3856 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3857 UseMI.getOperand(0).getReg())
3858 .addReg(Tmp, RegState::Kill);
3859 UseMI.getOperand(0).setReg(Tmp);
3860 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3861 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3862 }
3863
3864 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3865 if (DeleteDef)
3866 DefMI.eraseFromParent();
3867
3868 return true;
3869 }
3870
3871 // Added part is the constant: Use v_madak_{f16, f32}.
3872 if (Src2->isReg() && Src2->getReg() == Reg) {
3873 if (ST.getConstantBusLimit(Opc) < 2) {
3874 // Not allowed to use constant bus for another operand.
3875 // We can however allow an inline immediate as src0.
3876 bool Src0Inlined = false;
3877 if (Src0->isReg()) {
3878 // Try to inline constant if possible.
3879 // If the def is a move-immediate and this is its only use,
3880 // we save a VGPR here.
3881 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3882 if (Def && Def->isMoveImmediate() &&
3883 isInlineConstant(Def->getOperand(1)) &&
3884 MRI->hasOneNonDBGUse(Src0->getReg())) {
3885 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3886 Src0Inlined = true;
3887 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3888 RI.isSGPRReg(*MRI, Src0->getReg())) {
3889 return false;
3890 }
3891 // VGPR is okay as Src0 - fallthrough
3892 }
3893
3894 if (Src1->isReg() && !Src0Inlined) {
3895 // We have one slot for inlinable constant so far - try to fill it
3896 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3897 if (Def && Def->isMoveImmediate() &&
3898 isInlineConstant(Def->getOperand(1)) &&
3899 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3900 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3901 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3902 return false;
3903 // VGPR is okay as Src1 - fallthrough
3904 }
3905 }
3906
3907 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3908 if (pseudoToMCOpcode(NewOpc) == -1)
3909 return false;
3910
3911 // FIXME: This would be a lot easier if we could return a new instruction
3912 // instead of having to modify in place.
3913
3914 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3915 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3916 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3917 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3918 UseMI.untieRegOperand(
3919 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3920
3921 const std::optional<int64_t> SubRegImm =
3922 extractSubregFromImm(Imm, Src2->getSubReg());
3923
3924 // ChangingToImmediate adds Src2 back to the instruction.
3925 Src2->ChangeToImmediate(*SubRegImm);
3926
3927 // These come before src2.
3928 removeModOperands(UseMI);
3929 UseMI.setDesc(get(NewOpc));
3930
3931 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3932 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
3933 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3934 Register Tmp = MRI->createVirtualRegister(NewRC);
3935 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3936 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3937 UseMI.getOperand(0).getReg())
3938 .addReg(Tmp, RegState::Kill);
3939 UseMI.getOperand(0).setReg(Tmp);
3940 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3941 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
3942 }
3943
3944 // It might happen that UseMI was commuted and we now have an SGPR as
3945 // src1. If so, the inlined constant together with the SGPR would be
3946 // illegal, so re-legalize the operands.
3947 legalizeOperands(UseMI);
3948
3949 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3950 if (DeleteDef)
3951 DefMI.eraseFromParent();
3952
3953 return true;
3954 }
3955 }
3956
3957 return false;
3958}
3959
3960static bool
3961 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3962 ArrayRef<const MachineOperand *> BaseOps2) {
3963 if (BaseOps1.size() != BaseOps2.size())
3964 return false;
3965 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3966 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3967 return false;
3968 }
3969 return true;
3970}
3971
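// Conservatively determine whether the accesses [OffsetA, OffsetA + WidthA)
// and [OffsetB, OffsetB + WidthB) are disjoint; e.g. two 4-byte accesses at
// offsets 0 and 4 do not overlap.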
3972static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3973 LocationSize WidthB, int OffsetB) {
3974 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3975 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3976 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3977 return LowWidth.hasValue() &&
3978 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3979}
3980
3981bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3982 const MachineInstr &MIb) const {
3983 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3984 int64_t Offset0, Offset1;
3985 LocationSize Dummy0 = LocationSize::precise(0);
3986 LocationSize Dummy1 = LocationSize::precise(0);
3987 bool Offset0IsScalable, Offset1IsScalable;
3988 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3989 Dummy0, &RI) ||
3990 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3991 Dummy1, &RI))
3992 return false;
3993
3994 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3995 return false;
3996
3997 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3998 // FIXME: Handle ds_read2 / ds_write2.
3999 return false;
4000 }
4001 LocationSize Width0 = MIa.memoperands().front()->getSize();
4002 LocationSize Width1 = MIb.memoperands().front()->getSize();
4003 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
4004}
4005
4006 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
4007 const MachineInstr &MIb) const {
4008 assert(MIa.mayLoadOrStore() &&
4009 "MIa must load from or modify a memory location");
4010 assert(MIb.mayLoadOrStore() &&
4011 "MIb must load from or modify a memory location");
4012
4013 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
4014 return false;
4015
4016 // XXX - Can we relax this between address spaces?
4017 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
4018 return false;
4019
4020 if (isLDSDMA(MIa) || isLDSDMA(MIb))
4021 return false;
4022
4023 if (MIa.isBundle() || MIb.isBundle())
4024 return false;
4025
4026 // TODO: Should we check the address space from the MachineMemOperand? That
4027 // would allow us to distinguish objects we know don't alias based on the
4028 // underlying address space, even if it was lowered to a different one,
4029 // e.g. private accesses lowered to use MUBUF instructions on a scratch
4030 // buffer.
4031 if (isDS(MIa)) {
4032 if (isDS(MIb))
4033 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4034
4035 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
4036 }
4037
4038 if (isMUBUF(MIa) || isMTBUF(MIa)) {
4039 if (isMUBUF(MIb) || isMTBUF(MIb))
4040 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4041
4042 if (isFLAT(MIb))
4043 return isFLATScratch(MIb);
4044
4045 return !isSMRD(MIb);
4046 }
4047
4048 if (isSMRD(MIa)) {
4049 if (isSMRD(MIb))
4050 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4051
4052 if (isFLAT(MIb))
4053 return isFLATScratch(MIb);
4054
4055 return !isMUBUF(MIb) && !isMTBUF(MIb);
4056 }
4057
4058 if (isFLAT(MIa)) {
4059 if (isFLAT(MIb)) {
4060 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
4061 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
4062 return true;
4063
4064 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4065 }
4066
4067 return false;
4068 }
4069
4070 return false;
4071}
4072
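// If \p Reg is defined by a copy of an immediate, return true and report the
// immediate in \p Imm (and the defining instruction in \p DefMI, if requested).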
4073 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
4074 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4075 if (Reg.isPhysical())
4076 return false;
4077 auto *Def = MRI.getUniqueVRegDef(Reg);
4078 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
4079 Imm = Def->getOperand(1).getImm();
4080 if (DefMI)
4081 *DefMI = Def;
4082 return true;
4083 }
4084 return false;
4085}
4086
4087static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4088 MachineInstr **DefMI = nullptr) {
4089 if (!MO->isReg())
4090 return false;
4091 const MachineFunction *MF = MO->getParent()->getMF();
4092 const MachineRegisterInfo &MRI = MF->getRegInfo();
4093 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4094}
4095
4096 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
4097 MachineInstr &NewMI) {
4098 if (LV) {
4099 unsigned NumOps = MI.getNumOperands();
4100 for (unsigned I = 1; I < NumOps; ++I) {
4101 MachineOperand &Op = MI.getOperand(I);
4102 if (Op.isReg() && Op.isKill())
4103 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4104 }
4105 }
4106}
4107
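// Map a two-address MAC/FMAC opcode to its three-address MAD/FMA equivalent.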
4108static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4109 switch (Opc) {
4110 case AMDGPU::V_MAC_F16_e32:
4111 case AMDGPU::V_MAC_F16_e64:
4112 return AMDGPU::V_MAD_F16_e64;
4113 case AMDGPU::V_MAC_F32_e32:
4114 case AMDGPU::V_MAC_F32_e64:
4115 return AMDGPU::V_MAD_F32_e64;
4116 case AMDGPU::V_MAC_LEGACY_F32_e32:
4117 case AMDGPU::V_MAC_LEGACY_F32_e64:
4118 return AMDGPU::V_MAD_LEGACY_F32_e64;
4119 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4120 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4121 return AMDGPU::V_FMA_LEGACY_F32_e64;
4122 case AMDGPU::V_FMAC_F16_e32:
4123 case AMDGPU::V_FMAC_F16_e64:
4124 case AMDGPU::V_FMAC_F16_t16_e64:
4125 case AMDGPU::V_FMAC_F16_fake16_e64:
4126 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4127 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4128 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4129 : AMDGPU::V_FMA_F16_gfx9_e64;
4130 case AMDGPU::V_FMAC_F32_e32:
4131 case AMDGPU::V_FMAC_F32_e64:
4132 return AMDGPU::V_FMA_F32_e64;
4133 case AMDGPU::V_FMAC_F64_e32:
4134 case AMDGPU::V_FMAC_F64_e64:
4135 return AMDGPU::V_FMA_F64_e64;
4136 default:
4137 llvm_unreachable("invalid instruction");
4138 }
4139}
4140
4141/// Helper struct for the implementation of 3-address conversion to communicate
4142/// updates made to instruction operands.
4143 struct ThreeAddressUpdates {
4144 /// Other instruction whose def is no longer used by the converted
4145 /// instruction.
4147};
4148
4149 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4150 LiveVariables *LV,
4151 LiveIntervals *LIS) const {
4152 MachineBasicBlock &MBB = *MI.getParent();
4153 MachineInstr *CandidateMI = &MI;
4154
4155 if (MI.isBundle()) {
4156 // This is a temporary placeholder for bundle handling that enables us to
4157 // exercise the relevant code paths in the two-address instruction pass.
4158 if (MI.getBundleSize() != 1)
4159 return nullptr;
4160 CandidateMI = MI.getNextNode();
4161 }
4162
4163 ThreeAddressUpdates U;
4164 MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
4165 if (!NewMI)
4166 return nullptr;
4167
4168 if (MI.isBundle()) {
4169 CandidateMI->eraseFromBundle();
4170
4171 for (MachineOperand &MO : MI.all_defs()) {
4172 if (MO.isTied())
4173 MI.untieRegOperand(MO.getOperandNo());
4174 }
4175 } else {
4176 updateLiveVariables(LV, MI, *NewMI);
4177 if (LIS) {
4178 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4179 // SlotIndex of defs needs to be updated when converting to early-clobber
4180 MachineOperand &Def = NewMI->getOperand(0);
4181 if (Def.isEarlyClobber() && Def.isReg() &&
4182 LIS->hasInterval(Def.getReg())) {
4183 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4184 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4185 auto &LI = LIS->getInterval(Def.getReg());
4186 auto UpdateDefIndex = [&](LiveRange &LR) {
4187 auto *S = LR.find(OldIndex);
4188 if (S != LR.end() && S->start == OldIndex) {
4189 assert(S->valno && S->valno->def == OldIndex);
4190 S->start = NewIndex;
4191 S->valno->def = NewIndex;
4192 }
4193 };
4194 UpdateDefIndex(LI);
4195 for (auto &SR : LI.subranges())
4196 UpdateDefIndex(SR);
4197 }
4198 }
4199 }
4200
4201 if (U.RemoveMIUse) {
4202 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4203 // The only user is the instruction which will be killed.
4204 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4205
4206 if (MRI.hasOneNonDBGUse(DefReg)) {
4207 // We cannot just remove the DefMI here, calling pass will crash.
4208 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4209 U.RemoveMIUse->getOperand(0).setIsDead(true);
4210 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4211 U.RemoveMIUse->removeOperand(I);
4212 if (LV)
4213 LV->getVarInfo(DefReg).AliveBlocks.clear();
4214 }
4215
4216 if (MI.isBundle()) {
4217 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4218 if (!VRI.Reads && !VRI.Writes) {
4219 for (MachineOperand &MO : MI.all_uses()) {
4220 if (MO.isReg() && MO.getReg() == DefReg) {
4221 assert(MO.getSubReg() == 0 &&
4222 "tied sub-registers in bundles currently not supported");
4223 MI.removeOperand(MO.getOperandNo());
4224 break;
4225 }
4226 }
4227
4228 if (LIS)
4229 LIS->shrinkToUses(&LIS->getInterval(DefReg));
4230 }
4231 } else if (LIS) {
4232 LiveInterval &DefLI = LIS->getInterval(DefReg);
4233
4234 // We cannot delete the original instruction here, so hack out the use
4235 // in the original instruction with a dummy register so we can use
4236 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4237 // not have the complexity of deleting a use to consider here.
4238 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4239 for (MachineOperand &MIOp : MI.uses()) {
4240 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4241 MIOp.setIsUndef(true);
4242 MIOp.setReg(DummyReg);
4243 }
4244 }
4245
4246 if (MI.isBundle()) {
4247 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4248 if (!VRI.Reads && !VRI.Writes) {
4249 for (MachineOperand &MIOp : MI.uses()) {
4250 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4251 MIOp.setIsUndef(true);
4252 MIOp.setReg(DummyReg);
4253 }
4254 }
4255 }
4256
4257 MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
4258 false, /*isUndef=*/true));
4259 }
4260
4261 LIS->shrinkToUses(&DefLI);
4262 }
4263 }
4264
4265 return MI.isBundle() ? &MI : NewMI;
4266}
4267
4268 MachineInstr *
4269 SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4270 ThreeAddressUpdates &U) const {
4271 MachineBasicBlock &MBB = *MI.getParent();
4272 unsigned Opc = MI.getOpcode();
4273
4274 // Handle MFMA.
4275 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4276 if (NewMFMAOpc != -1) {
4277 MachineInstrBuilder MIB =
4278 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4279 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4280 MIB.add(MI.getOperand(I));
4281 return MIB;
4282 }
4283
4284 if (SIInstrInfo::isWMMA(MI)) {
4285 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4286 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4287 .setMIFlags(MI.getFlags());
4288 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4289 MIB->addOperand(MI.getOperand(I));
4290 return MIB;
4291 }
4292
4293 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4294 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4295 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4296 "present pre-RA");
4297
4298 // Handle MAC/FMAC.
4299 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4300 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4301 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4302 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4303 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4304 bool Src0Literal = false;
4305
4306 switch (Opc) {
4307 default:
4308 return nullptr;
4309 case AMDGPU::V_MAC_F16_e64:
4310 case AMDGPU::V_FMAC_F16_e64:
4311 case AMDGPU::V_FMAC_F16_t16_e64:
4312 case AMDGPU::V_FMAC_F16_fake16_e64:
4313 case AMDGPU::V_MAC_F32_e64:
4314 case AMDGPU::V_MAC_LEGACY_F32_e64:
4315 case AMDGPU::V_FMAC_F32_e64:
4316 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4317 case AMDGPU::V_FMAC_F64_e64:
4318 break;
4319 case AMDGPU::V_MAC_F16_e32:
4320 case AMDGPU::V_FMAC_F16_e32:
4321 case AMDGPU::V_MAC_F32_e32:
4322 case AMDGPU::V_MAC_LEGACY_F32_e32:
4323 case AMDGPU::V_FMAC_F32_e32:
4324 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4325 case AMDGPU::V_FMAC_F64_e32: {
4326 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4327 AMDGPU::OpName::src0);
4328 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4329 if (!Src0->isReg() && !Src0->isImm())
4330 return nullptr;
4331
4332 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4333 Src0Literal = true;
4334
4335 break;
4336 }
4337 }
4338
4339 MachineInstrBuilder MIB;
4340 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4341 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4342 const MachineOperand *Src0Mods =
4343 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4344 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4345 const MachineOperand *Src1Mods =
4346 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4347 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4348 const MachineOperand *Src2Mods =
4349 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4350 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4351 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4352 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4353
4354 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4355 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4356 // If we have an SGPR input, we will violate the constant bus restriction.
4357 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4358 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4359 MachineInstr *DefMI;
4360
4361 int64_t Imm;
4362 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4363 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4364 if (pseudoToMCOpcode(NewOpc) != -1) {
4365 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4366 .add(*Dst)
4367 .add(*Src0)
4368 .add(*Src1)
4369 .addImm(Imm)
4370 .setMIFlags(MI.getFlags());
4371 U.RemoveMIUse = DefMI;
4372 return MIB;
4373 }
4374 }
4375 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4376 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4377 if (pseudoToMCOpcode(NewOpc) != -1) {
4378 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4379 .add(*Dst)
4380 .add(*Src0)
4381 .addImm(Imm)
4382 .add(*Src2)
4383 .setMIFlags(MI.getFlags());
4384 U.RemoveMIUse = DefMI;
4385 return MIB;
4386 }
4387 }
4388 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4389 if (Src0Literal) {
4390 Imm = Src0->getImm();
4391 DefMI = nullptr;
4392 }
4393 if (pseudoToMCOpcode(NewOpc) != -1 &&
4394 isOperandLegal(
4395 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4396 Src1)) {
4397 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4398 .add(*Dst)
4399 .add(*Src1)
4400 .addImm(Imm)
4401 .add(*Src2)
4402 .setMIFlags(MI.getFlags());
4403 U.RemoveMIUse = DefMI;
4404 return MIB;
4405 }
4406 }
4407 }
4408
4409 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4410 // if VOP3 does not allow a literal operand.
4411 if (Src0Literal && !ST.hasVOP3Literal())
4412 return nullptr;
4413
4414 unsigned NewOpc = getNewFMAInst(ST, Opc);
4415
4416 if (pseudoToMCOpcode(NewOpc) == -1)
4417 return nullptr;
4418
4419 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4420 .add(*Dst)
4421 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4422 .add(*Src0)
4423 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4424 .add(*Src1)
4425 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4426 .add(*Src2)
4427 .addImm(Clamp ? Clamp->getImm() : 0)
4428 .addImm(Omod ? Omod->getImm() : 0)
4429 .setMIFlags(MI.getFlags());
4430 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4431 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4432 return MIB;
4433}
4434
4435// It's not generally safe to move VALU instructions across these since it will
4436// start using the register as a base index rather than directly.
4437// XXX - Why isn't hasSideEffects sufficient for these?
4439 switch (MI.getOpcode()) {
4440 case AMDGPU::S_SET_GPR_IDX_ON:
4441 case AMDGPU::S_SET_GPR_IDX_MODE:
4442 case AMDGPU::S_SET_GPR_IDX_OFF:
4443 return true;
4444 default:
4445 return false;
4446 }
4447}
4448
4449 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4450 const MachineBasicBlock *MBB,
4451 const MachineFunction &MF) const {
4452 // Skipping the check for SP writes in the base implementation. That check
4453 // was apparently added due to compile-time concerns.
4454 //
4455 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4456 // but is probably avoidable.
4457
4458 // Copied from base implementation.
4459 // Terminators and labels can't be scheduled around.
4460 if (MI.isTerminator() || MI.isPosition())
4461 return true;
4462
4463 // INLINEASM_BR can jump to another block
4464 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4465 return true;
4466
4467 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4468 return true;
4469
4470 // Target-independent instructions do not have an implicit-use of EXEC, even
4471 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4472 // boundaries prevents incorrect movements of such instructions.
4473 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4474 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4475 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4476 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4477 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4479}
4480
4481 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4482 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4483 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4484 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4485}
4486
4488 // Instructions that access scratch use FLAT encoding or BUF encodings.
4489 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4490 return false;
4491
4492 // SCRATCH instructions always access scratch.
4493 if (isFLATScratch(MI))
4494 return true;
4495
4496 // If FLAT_SCRATCH registers are not initialized, we can never access scratch
4497 // via the aperture.
4498 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4499 return false;
4500
4501 // If there are no memory operands then conservatively assume the flat
4502 // operation may access scratch.
4503 if (MI.memoperands_empty())
4504 return true;
4505
4506 // See if any memory operand specifies an address space that involves scratch.
4507 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4508 unsigned AS = Memop->getAddrSpace();
4509 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4510 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4511 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4512 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4513 }
4514 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4515 });
4516}
4517
4518 bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4519 assert(isFLAT(MI));
4520
4521 // All flat instructions use the VMEM counter except prefetch.
4522 if (!usesVM_CNT(MI))
4523 return false;
4524
4525 // If there are no memory operands then conservatively assume the flat
4526 // operation may access VMEM.
4527 if (MI.memoperands_empty())
4528 return true;
4529
4530 // See if any memory operand specifies an address space that involves VMEM.
4531 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
4532 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4533 // (GDS) address space is not supported by flat operations. Therefore, simply
4534 // return true unless only the LDS address space is found.
4535 for (const MachineMemOperand *Memop : MI.memoperands()) {
4536 unsigned AS = Memop->getAddrSpace();
4538 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4539 return true;
4540 }
4541
4542 return false;
4543}
4544
4545 bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4546 assert(isFLAT(MI));
4547
4548 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
4549 if (!usesLGKM_CNT(MI))
4550 return false;
4551
4552 // If in tgsplit mode then there can be no use of LDS.
4553 if (ST.isTgSplitEnabled())
4554 return false;
4555
4556 // If there are no memory operands then conservatively assume the flat
4557 // operation may access LDS.
4558 if (MI.memoperands_empty())
4559 return true;
4560
4561 // See if any memory operand specifies an address space that involves LDS.
4562 for (const MachineMemOperand *Memop : MI.memoperands()) {
4563 unsigned AS = Memop->getAddrSpace();
4564 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4565 return true;
4566 }
4567
4568 return false;
4569}
4570
4571 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4572 // Skip the full operand and register alias search modifiesRegister
4573 // does. There's only a handful of instructions that touch this, it's only an
4574 // implicit def, and doesn't alias any other registers.
4575 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4576}
4577
4578 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4579 unsigned Opcode = MI.getOpcode();
4580
4581 if (MI.mayStore() && isSMRD(MI))
4582 return true; // scalar store or atomic
4583
4584 // This will terminate the function when other lanes may need to continue.
4585 if (MI.isReturn())
4586 return true;
4587
4588 // These instructions cause shader I/O that may cause hardware lockups
4589 // when executed with an empty EXEC mask.
4590 //
4591 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4592 // EXEC = 0, but checking for that case here seems not worth it
4593 // given the typical code patterns.
4594 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4595 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4596 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4597 return true;
4598
4599 if (MI.isCall() || MI.isInlineAsm())
4600 return true; // conservative assumption
4601
4602 // Assume that barrier interactions are only intended with active lanes.
4603 if (isBarrier(Opcode))
4604 return true;
4605
4606 // A mode change is a scalar operation that influences vector instructions.
4607 if (modifiesModeRegister(MI))
4608 return true;
4609
4610 // These are like SALU instructions in terms of effects, so it's questionable
4611 // whether we should return true for those.
4612 //
4613 // However, executing them with EXEC = 0 causes them to operate on undefined
4614 // data, which we avoid by returning true here.
4615 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4616 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4617 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4618 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4619 return true;
4620
4621 return false;
4622}
4623
4624 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4625 const MachineInstr &MI) const {
4626 if (MI.isMetaInstruction())
4627 return false;
4628
4629 // This won't read exec if this is an SGPR->SGPR copy.
4630 if (MI.isCopyLike()) {
4631 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4632 return true;
4633
4634 // Make sure this isn't copying exec as a normal operand
4635 return MI.readsRegister(AMDGPU::EXEC, &RI);
4636 }
4637
4638 // Make a conservative assumption about the callee.
4639 if (MI.isCall())
4640 return true;
4641
4642 // Be conservative with any unhandled generic opcodes.
4643 if (!isTargetSpecificOpcode(MI.getOpcode()))
4644 return true;
4645
4646 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4647}
4648
4649bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4650 switch (Imm.getBitWidth()) {
4651 case 1: // This likely will be a condition code mask.
4652 return true;
4653
4654 case 32:
4655 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4656 ST.hasInv2PiInlineImm());
4657 case 64:
4658 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4659 ST.hasInv2PiInlineImm());
4660 case 16:
4661 return ST.has16BitInsts() &&
4662 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4663 ST.hasInv2PiInlineImm());
4664 default:
4665 llvm_unreachable("invalid bitwidth");
4666 }
4667}
4668
4669 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4670 APInt IntImm = Imm.bitcastToAPInt();
4671 int64_t IntImmVal = IntImm.getSExtValue();
4672 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4673 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4674 default:
4675 llvm_unreachable("invalid fltSemantics");
4676 case APFloat::S_IEEEsingle:
4677 case APFloat::S_IEEEdouble:
4678 return isInlineConstant(IntImm);
4679 case APFloat::S_BFloat:
4680 return ST.has16BitInsts() &&
4681 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4682 case APFloat::S_IEEEhalf:
4683 return ST.has16BitInsts() &&
4684 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4685 }
4686}
4687
4688bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4689 // MachineOperand provides no way to tell the true operand size, since it only
4690 // records a 64-bit value. We need to know the size to determine if a 32-bit
4691 // floating point immediate bit pattern is legal for an integer immediate. It
4692 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4693 switch (OperandType) {
4703 int32_t Trunc = static_cast<int32_t>(Imm);
4704 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4705 }
4711 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4714 // We would expect inline immediates to not be concerned with an integer/fp
4715 // distinction. However, in the case of 16-bit integer operations, the
4716 // "floating point" values appear to not work. It seems to read the low 16 bits
4717 // of 32-bit immediates, which happens to always work for the integer
4718 // values.
4719 //
4720 // See llvm bugzilla 46302.
4721 //
4722 // TODO: Theoretically we could use op-sel to use the high bits of the
4723 // 32-bit FP values.
4732 return AMDGPU::isPKFMACF16InlineConstant(Imm, ST.isGFX11Plus());
4737 return false;
4740 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4741 // A few special case instructions have 16-bit operands on subtargets
4742 // where 16-bit instructions are not legal.
4743 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4744 // constants in these cases
4745 int16_t Trunc = static_cast<int16_t>(Imm);
4746 return ST.has16BitInsts() &&
4747 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4748 }
4749
4750 return false;
4751 }
4754 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4755 int16_t Trunc = static_cast<int16_t>(Imm);
4756 return ST.has16BitInsts() &&
4757 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4758 }
4759 return false;
4760 }
4764 return false;
4766 return isLegalAV64PseudoImm(Imm);
4769 // Always embedded in the instruction for free.
4770 return true;
4780 // Just ignore anything else.
4781 return true;
4782 default:
4783 llvm_unreachable("invalid operand type");
4784 }
4785}
4786
4787static bool compareMachineOp(const MachineOperand &Op0,
4788 const MachineOperand &Op1) {
4789 if (Op0.getType() != Op1.getType())
4790 return false;
4791
4792 switch (Op0.getType()) {
4793 case MachineOperand::MO_Register:
4794 return Op0.getReg() == Op1.getReg();
4795 case MachineOperand::MO_Immediate:
4796 return Op0.getImm() == Op1.getImm();
4797 default:
4798 llvm_unreachable("Didn't expect to be comparing these operand types");
4799 }
4800}
4801
4802 bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4803 const MCOperandInfo &OpInfo) const {
4804 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4805 return true;
4806
4807 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4808 return false;
4809
4810 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4811 return true;
4812
4813 return ST.hasVOP3Literal();
4814}
4815
4816bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4817 int64_t ImmVal) const {
4818 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4819 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4820 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4821 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4822 AMDGPU::OpName::src2))
4823 return false;
4824 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4825 }
4826
4827 return isLiteralOperandLegal(InstDesc, OpInfo);
4828}
4829
4830bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4831 const MachineOperand &MO) const {
4832 if (MO.isImm())
4833 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4834
4835 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4836 "unexpected imm-like operand kind");
4837 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4838 return isLiteralOperandLegal(InstDesc, OpInfo);
4839}
4840
4841 bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4842 // 2 32-bit inline constants packed into one.
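// For example, 0xFFFFFFFF00000001 is legal: the high half (-1) and the low
// half (1) are each a 32-bit inline constant.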
4843 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4844 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4845}
4846
4847bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4848 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4849 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4850 return false;
4851
4852 int Op32 = AMDGPU::getVOPe32(Opcode);
4853 if (Op32 == -1)
4854 return false;
4855
4856 return pseudoToMCOpcode(Op32) != -1;
4857}
4858
4859bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4860 // The src0_modifier operand is present on all instructions
4861 // that have modifiers.
4862
4863 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4864}
4865
4866 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4867 AMDGPU::OpName OpName) const {
4868 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4869 return Mods && Mods->getImm();
4870}
4871
4872 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4873 return any_of(ModifierOpNames,
4874 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4875}
4876
4877 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4878 const MachineRegisterInfo &MRI) const {
4879 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4880 // Can't shrink instruction with three operands.
4881 if (Src2) {
4882 switch (MI.getOpcode()) {
4883 default: return false;
4884
4885 case AMDGPU::V_ADDC_U32_e64:
4886 case AMDGPU::V_SUBB_U32_e64:
4887 case AMDGPU::V_SUBBREV_U32_e64: {
4888 const MachineOperand *Src1
4889 = getNamedOperand(MI, AMDGPU::OpName::src1);
4890 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4891 return false;
4892 // Additional verification is needed for sdst/src2.
4893 return true;
4894 }
4895 case AMDGPU::V_MAC_F16_e64:
4896 case AMDGPU::V_MAC_F32_e64:
4897 case AMDGPU::V_MAC_LEGACY_F32_e64:
4898 case AMDGPU::V_FMAC_F16_e64:
4899 case AMDGPU::V_FMAC_F16_t16_e64:
4900 case AMDGPU::V_FMAC_F16_fake16_e64:
4901 case AMDGPU::V_FMAC_F32_e64:
4902 case AMDGPU::V_FMAC_F64_e64:
4903 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4904 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4905 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4906 return false;
4907 break;
4908
4909 case AMDGPU::V_CNDMASK_B32_e64:
4910 break;
4911 }
4912 }
4913
4914 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4915 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4916 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4917 return false;
4918
4919 // We don't need to check src0, all input types are legal, so just make sure
4920 // src0 isn't using any modifiers.
4921 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4922 return false;
4923
4924 // Can it be shrunk to a valid 32 bit opcode?
4925 if (!hasVALU32BitEncoding(MI.getOpcode()))
4926 return false;
4927
4928 // Check output modifiers
4929 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4930 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4931 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4932 // TODO: Can we avoid checking bound_ctrl/fi here?
4933 // They are only used by permlane*_swap special case.
4934 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4935 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4936}
4937
4938// Set VCC operand with all flags from \p Orig, except for setting it as
4939// implicit.
4940 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4941 const MachineOperand &Orig) {
4942
4943 for (MachineOperand &Use : MI.implicit_operands()) {
4944 if (Use.isUse() &&
4945 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4946 Use.setIsUndef(Orig.isUndef());
4947 Use.setIsKill(Orig.isKill());
4948 return;
4949 }
4950 }
4951}
4952
4953 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4954 unsigned Op32) const {
4955 MachineBasicBlock *MBB = MI.getParent();
4956
4957 const MCInstrDesc &Op32Desc = get(Op32);
4958 MachineInstrBuilder Inst32 =
4959 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4960 .setMIFlags(MI.getFlags());
4961
4962 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4963 // For VOPC instructions, this is replaced by an implicit def of vcc.
4964
4965 // We assume the defs of the shrunk opcode are in the same order, and the
4966 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4967 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4968 Inst32.add(MI.getOperand(I));
4969
4970 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4971
4972 int Idx = MI.getNumExplicitDefs();
4973 for (const MachineOperand &Use : MI.explicit_uses()) {
4974 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4975 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4976 continue;
4977
4978 if (&Use == Src2) {
4979 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4980 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4981 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4982 // of vcc was already added during the initial BuildMI, but we
4983 // 1) may need to change vcc to vcc_lo to preserve the original register
4984 // 2) have to preserve the original flags.
4985 copyFlagsToImplicitVCC(*Inst32, *Src2);
4986 continue;
4987 }
4988 }
4989
4990 Inst32.add(Use);
4991 }
4992
4993 // FIXME: Losing implicit operands
4994 fixImplicitOperands(*Inst32);
4995 return Inst32;
4996}
4997
4998 bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4999 // Null is free
5000 Register Reg = RegOp.getReg();
5001 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
5002 return false;
5003
5004 // SGPRs use the constant bus
5005
5006 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
5007 // physical register operands should also count, except for exec.
5008 if (RegOp.isImplicit())
5009 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
5010
5011 // SGPRs use the constant bus
5012 return AMDGPU::SReg_32RegClass.contains(Reg) ||
5013 AMDGPU::SReg_64RegClass.contains(Reg);
5014}
5015
5017 const MachineRegisterInfo &MRI) const {
5018 Register Reg = RegOp.getReg();
5019 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5020 : physRegUsesConstantBus(RegOp);
5021}
5022
5023 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
5024 const MachineOperand &MO,
5025 const MCOperandInfo &OpInfo) const {
5026 // Literal constants use the constant bus.
5027 if (!MO.isReg())
5028 return !isInlineConstant(MO, OpInfo);
5029
5030 Register Reg = MO.getReg();
5031 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5033}
5034
5036 for (const MachineOperand &MO : MI.implicit_operands()) {
5037 // We only care about reads.
5038 if (MO.isDef())
5039 continue;
5040
5041 switch (MO.getReg()) {
5042 case AMDGPU::VCC:
5043 case AMDGPU::VCC_LO:
5044 case AMDGPU::VCC_HI:
5045 case AMDGPU::M0:
5046 case AMDGPU::FLAT_SCR:
5047 return MO.getReg();
5048
5049 default:
5050 break;
5051 }
5052 }
5053
5054 return Register();
5055}
5056
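// Return true if \p MI is expected to read EXEC: ordinary VALU instructions
// do, while lane-access instructions (readlane/writelane and the SGPR spill
// pseudos) and purely scalar instructions do not.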
5057static bool shouldReadExec(const MachineInstr &MI) {
5058 if (SIInstrInfo::isVALU(MI)) {
5059 switch (MI.getOpcode()) {
5060 case AMDGPU::V_READLANE_B32:
5061 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5062 case AMDGPU::V_WRITELANE_B32:
5063 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5064 return false;
5065 }
5066
5067 return true;
5068 }
5069
5070 if (MI.isPreISelOpcode() ||
5071 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
5072 SIInstrInfo::isSALU(MI) ||
5073 SIInstrInfo::isSMRD(MI))
5074 return false;
5075
5076 return true;
5077}
5078
5079static bool isRegOrFI(const MachineOperand &MO) {
5080 return MO.isReg() || MO.isFI();
5081}
5082
5083static bool isSubRegOf(const SIRegisterInfo &TRI,
5084 const MachineOperand &SuperVec,
5085 const MachineOperand &SubReg) {
5086 if (SubReg.getReg().isPhysical())
5087 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5088
5089 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5090 SubReg.getReg() == SuperVec.getReg();
5091}
5092
5093// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5094bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5095 const MachineRegisterInfo &MRI,
5096 StringRef &ErrInfo) const {
5097 Register DstReg = MI.getOperand(0).getReg();
5098 Register SrcReg = MI.getOperand(1).getReg();
5099 // This is a check for copy from vector register to SGPR
5100 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5101 ErrInfo = "illegal copy from vector register to SGPR";
5102 return false;
5103 }
5104 return true;
5105}
5106
5107 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
5108 StringRef &ErrInfo) const {
5109 uint16_t Opcode = MI.getOpcode();
5110 const MachineFunction *MF = MI.getMF();
5111 const MachineRegisterInfo &MRI = MF->getRegInfo();
5112
5113 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5114 // Find a better property to recognize the point where instruction selection
5115 // is just done.
5116 // We can only enforce this check after SIFixSGPRCopies pass so that the
5117 // illegal copies are legalized and thereafter we don't expect a pass
5118 // inserting similar copies.
5119 if (!MRI.isSSA() && MI.isCopy())
5120 return verifyCopy(MI, MRI, ErrInfo);
5121
5122 if (SIInstrInfo::isGenericOpcode(Opcode))
5123 return true;
5124
5125 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5126 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5127 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5128 int Src3Idx = -1;
5129 if (Src0Idx == -1) {
5130 // VOPD V_DUAL_* instructions use different operand names.
5131 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5132 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5133 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5134 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5135 }
5136
5137 // Make sure the number of operands is correct.
5138 const MCInstrDesc &Desc = get(Opcode);
5139 if (!Desc.isVariadic() &&
5140 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5141 ErrInfo = "Instruction has wrong number of operands.";
5142 return false;
5143 }
5144
5145 if (MI.isInlineAsm()) {
5146 // Verify register classes for inlineasm constraints.
5147 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5148 I != E; ++I) {
5149 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5150 if (!RC)
5151 continue;
5152
5153 const MachineOperand &Op = MI.getOperand(I);
5154 if (!Op.isReg())
5155 continue;
5156
5157 Register Reg = Op.getReg();
5158 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5159 ErrInfo = "inlineasm operand has incorrect register class.";
5160 return false;
5161 }
5162 }
5163
5164 return true;
5165 }
5166
5167 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5168 ErrInfo = "missing memory operand from image instruction.";
5169 return false;
5170 }
5171
5172 // Make sure the register classes are correct.
5173 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5174 const MachineOperand &MO = MI.getOperand(i);
5175 if (MO.isFPImm()) {
5176 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5177 "all fp values to integers.";
5178 return false;
5179 }
5180
5181 const MCOperandInfo &OpInfo = Desc.operands()[i];
5182 int16_t RegClass = getOpRegClassID(OpInfo);
5183
5184 switch (OpInfo.OperandType) {
5185 case MCOI::OPERAND_REGISTER:
5186 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5187 ErrInfo = "Illegal immediate value for operand.";
5188 return false;
5189 }
5190 break;
5204 break;
5206 break;
5207 break;
5221 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5222 ErrInfo = "Illegal immediate value for operand.";
5223 return false;
5224 }
5225 break;
5226 }
5228 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5229 ErrInfo = "Expected inline constant for operand.";
5230 return false;
5231 }
5232 break;
5236 break;
5241 // Check if this operand is an immediate.
5242 // FrameIndex operands will be replaced by immediates, so they are
5243 // allowed.
5244 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5245 ErrInfo = "Expected immediate, but got non-immediate";
5246 return false;
5247 }
5248 break;
5252 break;
5253 default:
5254 if (OpInfo.isGenericType())
5255 continue;
5256 break;
5257 }
5258
5259 if (!MO.isReg())
5260 continue;
5261 Register Reg = MO.getReg();
5262 if (!Reg)
5263 continue;
5264
5265 // FIXME: Ideally we would have separate instruction definitions with the
5266 // aligned register constraint.
5267 // FIXME: We do not verify inline asm operands, but custom inline asm
5268 // verification is broken anyway
5269 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
5270 Opcode != AMDGPU::V_MOV_B64_PSEUDO) {
5271 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5272 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5273 if (const TargetRegisterClass *SubRC =
5274 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5275 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5276 if (RC)
5277 RC = SubRC;
5278 }
5279 }
5280
5281 // Check that this is the aligned version of the class.
5282 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5283 ErrInfo = "Subtarget requires even aligned vector registers";
5284 return false;
5285 }
5286 }
5287
5288 if (RegClass != -1) {
5289 if (Reg.isVirtual())
5290 continue;
5291
5292 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5293 if (!RC->contains(Reg)) {
5294 ErrInfo = "Operand has incorrect register class.";
5295 return false;
5296 }
5297 }
5298 }
5299
5300 // Verify SDWA
5301 if (isSDWA(MI)) {
5302 if (!ST.hasSDWA()) {
5303 ErrInfo = "SDWA is not supported on this target";
5304 return false;
5305 }
5306
5307 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5308 AMDGPU::OpName::dst_sel}) {
5309 const MachineOperand *MO = getNamedOperand(MI, Op);
5310 if (!MO)
5311 continue;
5312 int64_t Imm = MO->getImm();
5313 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5314 ErrInfo = "Invalid SDWA selection";
5315 return false;
5316 }
5317 }
5318
5319 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5320
5321 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5322 if (OpIdx == -1)
5323 continue;
5324 const MachineOperand &MO = MI.getOperand(OpIdx);
5325
5326 if (!ST.hasSDWAScalar()) {
5327 // Only VGPRs on VI
5328 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5329 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5330 return false;
5331 }
5332 } else {
5333 // No immediates on GFX9
5334 if (!MO.isReg()) {
5335 ErrInfo =
5336 "Only reg allowed as operands in SDWA instructions on GFX9+";
5337 return false;
5338 }
5339 }
5340 }
5341
5342 if (!ST.hasSDWAOmod()) {
5343 // No omod allowed on VI
5344 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5345 if (OMod != nullptr &&
5346 (!OMod->isImm() || OMod->getImm() != 0)) {
5347 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5348 return false;
5349 }
5350 }
5351
5352 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5353 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5354 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5355 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5356 const MachineOperand *Src0ModsMO =
5357 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5358 unsigned Mods = Src0ModsMO->getImm();
5359 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5360 Mods & SISrcMods::SEXT) {
5361 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5362 return false;
5363 }
5364 }
5365
5366 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5367 if (isVOPC(BasicOpcode)) {
5368 if (!ST.hasSDWASdst() && DstIdx != -1) {
5369 // Only vcc allowed as dst on VI for VOPC
5370 const MachineOperand &Dst = MI.getOperand(DstIdx);
5371 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5372 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5373 return false;
5374 }
5375 } else if (!ST.hasSDWAOutModsVOPC()) {
5376 // No clamp allowed on GFX9 for VOPC
5377 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5378 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5379 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5380 return false;
5381 }
5382
5383 // No omod allowed on GFX9 for VOPC
5384 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5385 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5386 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5387 return false;
5388 }
5389 }
5390 }
5391
5392 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5393 if (DstUnused && DstUnused->isImm() &&
5394 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5395 const MachineOperand &Dst = MI.getOperand(DstIdx);
5396 if (!Dst.isReg() || !Dst.isTied()) {
5397 ErrInfo = "Dst register should have tied register";
5398 return false;
5399 }
5400
5401 const MachineOperand &TiedMO =
5402 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5403 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5404 ErrInfo =
5405 "Dst register should be tied to implicit use of preserved register";
5406 return false;
5407 }
5408 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5409 ErrInfo = "Dst register should use same physical register as preserved";
5410 return false;
5411 }
5412 }
5413 }
5414
5415 // Verify MIMG / VIMAGE / VSAMPLE
5416 if (isImage(Opcode) && !MI.mayStore()) {
5417 // Ensure that the return type used is large enough for all the options
5418 // being used TFE/LWE require an extra result register.
5419 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5420 if (DMask) {
5421 uint64_t DMaskImm = DMask->getImm();
5422 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5423 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5424 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5425 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5426
5427 // Adjust for packed 16 bit values
5428 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5429 RegCount = divideCeil(RegCount, 2);
5430
5431 // Adjust if using LWE or TFE
5432 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5433 RegCount += 1;
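// Illustration: a load with dmask = 0b0111 needs 3 result dwords; with d16 on
// a packed-D16 target that rounds up to 2 dwords, and tfe/lwe append one more.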
5434
5435 const uint32_t DstIdx =
5436 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5437 const MachineOperand &Dst = MI.getOperand(DstIdx);
5438 if (Dst.isReg()) {
5439 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5440 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5441 if (RegCount > DstSize) {
5442 ErrInfo = "Image instruction returns too many registers for dst "
5443 "register class";
5444 return false;
5445 }
5446 }
5447 }
5448 }
5449
5450 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5451 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5452 unsigned ConstantBusCount = 0;
5453 bool UsesLiteral = false;
5454 const MachineOperand *LiteralVal = nullptr;
5455
5456 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5457 if (ImmIdx != -1) {
5458 ++ConstantBusCount;
5459 UsesLiteral = true;
5460 LiteralVal = &MI.getOperand(ImmIdx);
5461 }
5462
5463 SmallVector<Register, 2> SGPRsUsed;
5464 Register SGPRUsed;
5465
5466 // Only look at the true operands. Only a real operand can use the constant
5467 // bus, and we don't want to check pseudo-operands like the source modifier
5468 // flags.
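// Illustration: pseudo-operands such as src0_modifiers or clamp are plain
// immediates and never occupy the constant bus, so only src0..src3 matter here.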
5469 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5470 if (OpIdx == -1)
5471 continue;
5472 const MachineOperand &MO = MI.getOperand(OpIdx);
5473 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5474 if (MO.isReg()) {
5475 SGPRUsed = MO.getReg();
5476 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5477 ++ConstantBusCount;
5478 SGPRsUsed.push_back(SGPRUsed);
5479 }
5480 } else if (!MO.isFI()) { // Treat FI like a register.
5481 if (!UsesLiteral) {
5482 ++ConstantBusCount;
5483 UsesLiteral = true;
5484 LiteralVal = &MO;
5485 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5486 assert(isVOP2(MI) || isVOP3(MI));
5487 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5488 return false;
5489 }
5490 }
5491 }
5492 }
5493
5494 SGPRUsed = findImplicitSGPRRead(MI);
5495 if (SGPRUsed) {
5496 // Implicit uses may safely overlap true operands
5497 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5498 return !RI.regsOverlap(SGPRUsed, SGPR);
5499 })) {
5500 ++ConstantBusCount;
5501 SGPRsUsed.push_back(SGPRUsed);
5502 }
5503 }
5504
5505 // v_writelane_b32 is an exception to the constant bus restriction:
5506 // vsrc0 can be an SGPR, a constant, or m0, and the lane select an SGPR, m0, or an inline constant.
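// Illustration: v_writelane_b32 v0, s1, s2 reads two distinct SGPRs even
// though the generic pre-GFX10 limit is a single constant bus use.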
5507 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5508 Opcode != AMDGPU::V_WRITELANE_B32) {
5509 ErrInfo = "VOP* instruction violates constant bus restriction";
5510 return false;
5511 }
5512
5513 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5514 ErrInfo = "VOP3 instruction uses literal";
5515 return false;
5516 }
5517 }
5518
5519 // Special case for writelane - this can break the multiple constant bus rule,
5520 // but still can't use more than one SGPR register
5521 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5522 unsigned SGPRCount = 0;
5523 Register SGPRUsed;
5524
5525 for (int OpIdx : {Src0Idx, Src1Idx}) {
5526 if (OpIdx == -1)
5527 break;
5528
5529 const MachineOperand &MO = MI.getOperand(OpIdx);
5530
5531 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5532 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5533 if (MO.getReg() != SGPRUsed)
5534 ++SGPRCount;
5535 SGPRUsed = MO.getReg();
5536 }
5537 }
5538 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5539 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5540 return false;
5541 }
5542 }
5543 }
5544
5545 // Verify misc. restrictions on specific instructions.
5546 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5547 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5548 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5549 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5550 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5551 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5552 if (!compareMachineOp(Src0, Src1) &&
5553 !compareMachineOp(Src0, Src2)) {
5554 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5555 return false;
5556 }
5557 }
5558 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5559 SISrcMods::ABS) ||
5560 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5561 SISrcMods::ABS) ||
5562 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5563 SISrcMods::ABS)) {
5564 ErrInfo = "ABS not allowed in VOP3B instructions";
5565 return false;
5566 }
5567 }
5568
5569 if (isSOP2(MI) || isSOPC(MI)) {
5570 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5571 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5572
5573 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5574 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5575 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5576 !Src0.isIdenticalTo(Src1)) {
5577 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5578 return false;
5579 }
5580 }
5581
5582 if (isSOPK(MI)) {
5583 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5584 if (Desc.isBranch()) {
5585 if (!Op->isMBB()) {
5586 ErrInfo = "invalid branch target for SOPK instruction";
5587 return false;
5588 }
5589 } else {
5590 uint64_t Imm = Op->getImm();
5591 if (sopkIsZext(Opcode)) {
5592 if (!isUInt<16>(Imm)) {
5593 ErrInfo = "invalid immediate for SOPK instruction";
5594 return false;
5595 }
5596 } else {
5597 if (!isInt<16>(Imm)) {
5598 ErrInfo = "invalid immediate for SOPK instruction";
5599 return false;
5600 }
5601 }
5602 }
5603 }
5604
5605 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5606 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5607 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5608 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5609 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5610 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5611
5612 const unsigned StaticNumOps =
5613 Desc.getNumOperands() + Desc.implicit_uses().size();
5614 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5615
5616 // Require additional implicit operands. This allows a fixup done by the
5617 // post RA scheduler where the main implicit operand is killed and
5618 // implicit-defs are added for sub-registers that remain live after this
5619 // instruction.
5620 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5621 ErrInfo = "missing implicit register operands";
5622 return false;
5623 }
5624
5625 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5626 if (IsDst) {
5627 if (!Dst->isUse()) {
5628 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5629 return false;
5630 }
5631
5632 unsigned UseOpIdx;
5633 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5634 UseOpIdx != StaticNumOps + 1) {
5635 ErrInfo = "movrel implicit operands should be tied";
5636 return false;
5637 }
5638 }
5639
5640 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5641 const MachineOperand &ImpUse
5642 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5643 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5644 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5645 ErrInfo = "src0 should be subreg of implicit vector use";
5646 return false;
5647 }
5648 }
5649
5650 // Make sure we aren't losing exec uses in the td files. This mostly requires
5651 // being careful when using 'let Uses' to add other use registers.
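// Illustration: an ordinary VALU such as V_ADD_F32_e32 is expected to carry an
// implicit $exec use; dropping it from the .td definition would trip this check.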
5652 if (shouldReadExec(MI)) {
5653 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5654 ErrInfo = "VALU instruction does not implicitly read exec mask";
5655 return false;
5656 }
5657 }
5658
5659 if (isSMRD(MI)) {
5660 if (MI.mayStore() &&
5661 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5662 // The register offset form of scalar stores may only use m0 as the
5663 // soffset register.
5664 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5665 if (Soff && Soff->getReg() != AMDGPU::M0) {
5666 ErrInfo = "scalar stores must use m0 as offset register";
5667 return false;
5668 }
5669 }
5670 }
5671
5672 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5673 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5674 if (Offset->getImm() != 0) {
5675 ErrInfo = "subtarget does not support offsets in flat instructions";
5676 return false;
5677 }
5678 }
5679
5680 if (isDS(MI) && !ST.hasGDS()) {
5681 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5682 if (GDSOp && GDSOp->getImm() != 0) {
5683 ErrInfo = "GDS is not supported on this subtarget";
5684 return false;
5685 }
5686 }
5687
5688 if (isImage(MI)) {
5689 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5690 if (DimOp) {
5691 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5692 AMDGPU::OpName::vaddr0);
5693 AMDGPU::OpName RSrcOpName =
5694 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5695 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5696 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5697 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5698 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5699 const AMDGPU::MIMGDimInfo *Dim =
5700 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5701
5702 if (!Dim) {
5703 ErrInfo = "dim is out of range";
5704 return false;
5705 }
5706
5707 bool IsA16 = false;
5708 if (ST.hasR128A16()) {
5709 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5710 IsA16 = R128A16->getImm() != 0;
5711 } else if (ST.hasA16()) {
5712 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5713 IsA16 = A16->getImm() != 0;
5714 }
5715
5716 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5717
5718 unsigned AddrWords =
5719 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5720
5721 unsigned VAddrWords;
5722 if (IsNSA) {
5723 VAddrWords = RsrcIdx - VAddr0Idx;
5724 if (ST.hasPartialNSAEncoding() &&
5725 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5726 unsigned LastVAddrIdx = RsrcIdx - 1;
5727 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5728 }
5729 } else {
5730 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5731 if (AddrWords > 12)
5732 AddrWords = 16;
5733 }
5734
5735 if (VAddrWords != AddrWords) {
5736 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5737 << " but got " << VAddrWords << "\n");
5738 ErrInfo = "bad vaddr size";
5739 return false;
5740 }
5741 }
5742 }
5743
5744 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5745 if (DppCt) {
5746 using namespace AMDGPU::DPP;
5747
5748 unsigned DC = DppCt->getImm();
5749 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5750 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5751 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5752 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5753 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5754 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5755 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5756 ErrInfo = "Invalid dpp_ctrl value";
5757 return false;
5758 }
5759 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5760 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5761 ErrInfo = "Invalid dpp_ctrl value: "
5762 "wavefront shifts are not supported on GFX10+";
5763 return false;
5764 }
5765 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5766 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5767 ErrInfo = "Invalid dpp_ctrl value: "
5768 "broadcasts are not supported on GFX10+";
5769 return false;
5770 }
5771 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5772 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5773 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5774 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5775 !ST.hasGFX90AInsts()) {
5776 ErrInfo = "Invalid dpp_ctrl value: "
5777 "row_newbroadcast/row_share is not supported before "
5778 "GFX90A/GFX10";
5779 return false;
5780 }
5781 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5782 ErrInfo = "Invalid dpp_ctrl value: "
5783 "row_share and row_xmask are not supported before GFX10";
5784 return false;
5785 }
5786 }
5787
5788 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5790 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5791 ErrInfo = "Invalid dpp_ctrl value: "
5792 "DP ALU dpp only support row_newbcast";
5793 return false;
5794 }
5795 }
5796
5797 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5798 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5799 AMDGPU::OpName DataName =
5800 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5801 const MachineOperand *Data = getNamedOperand(MI, DataName);
5802 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5803 if (Data && !Data->isReg())
5804 Data = nullptr;
5805
5806 if (ST.hasGFX90AInsts()) {
5807 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5808 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5809 ErrInfo = "Invalid register class: "
5810 "vdata and vdst should be both VGPR or AGPR";
5811 return false;
5812 }
5813 if (Data && Data2 &&
5814 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5815 ErrInfo = "Invalid register class: "
5816 "both data operands should be VGPR or AGPR";
5817 return false;
5818 }
5819 } else {
5820 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5821 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5822 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5823 ErrInfo = "Invalid register class: "
5824 "agpr loads and stores not supported on this GPU";
5825 return false;
5826 }
5827 }
5828 }
5829
5830 if (ST.needsAlignedVGPRs()) {
5831 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5832 const MachineOperand *Op = getNamedOperand(MI, OpName);
5833 if (!Op)
5834 return true;
5835 Register Reg = Op->getReg();
5836 if (Reg.isPhysical())
5837 return !(RI.getHWRegIndex(Reg) & 1);
5838 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5839 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5840 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5841 };
5842
5843 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5844 Opcode == AMDGPU::DS_GWS_BARRIER) {
5845
5846 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5847 ErrInfo = "Subtarget requires even aligned vector registers "
5848 "for DS_GWS instructions";
5849 return false;
5850 }
5851 }
5852
5853 if (isMIMG(MI)) {
5854 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5855 ErrInfo = "Subtarget requires even aligned vector registers "
5856 "for vaddr operand of image instructions";
5857 return false;
5858 }
5859 }
5860 }
5861
5862 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5863 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5864 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5865 ErrInfo = "Invalid register class: "
5866 "v_accvgpr_write with an SGPR is not supported on this GPU";
5867 return false;
5868 }
5869 }
5870
5871 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5872 const MachineOperand &SrcOp = MI.getOperand(1);
5873 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5874 ErrInfo = "pseudo expects only physical SGPRs";
5875 return false;
5876 }
5877 }
5878
5879 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5880 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5881 if (!ST.hasScaleOffset()) {
5882 ErrInfo = "Subtarget does not support offset scaling";
5883 return false;
5884 }
5885 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5886 ErrInfo = "Instruction does not support offset scaling";
5887 return false;
5888 }
5889 }
5890 }
5891
5892 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5893 // information.
5894 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5895 for (unsigned I = 0; I < 3; ++I) {
5896 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
5897 return false;
5898 }
5899 }
5900
5901 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5902 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
5903 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
5904 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
5905 &AMDGPU::SReg_64RegClass) ||
5906 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5907 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5908 return false;
5909 }
5910 }
5911
5912 return true;
5913}
5914
5915// It is more readable to list mapped opcodes on the same line.
5916// clang-format off
5917
5919 switch (MI.getOpcode()) {
5920 default: return AMDGPU::INSTRUCTION_LIST_END;
5921 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5922 case AMDGPU::COPY: return AMDGPU::COPY;
5923 case AMDGPU::PHI: return AMDGPU::PHI;
5924 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5925 case AMDGPU::WQM: return AMDGPU::WQM;
5926 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5927 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5928 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5929 case AMDGPU::S_MOV_B32: {
5930 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5931 return MI.getOperand(1).isReg() ||
5932 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5933 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5934 }
5935 case AMDGPU::S_ADD_I32:
5936 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5937 case AMDGPU::S_ADDC_U32:
5938 return AMDGPU::V_ADDC_U32_e32;
5939 case AMDGPU::S_SUB_I32:
5940 return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5941 // FIXME: These are not consistently handled, and selected when the carry is
5942 // used.
5943 case AMDGPU::S_ADD_U32:
5944 return AMDGPU::V_ADD_CO_U32_e32;
5945 case AMDGPU::S_SUB_U32:
5946 return AMDGPU::V_SUB_CO_U32_e32;
5947 case AMDGPU::S_ADD_U64_PSEUDO:
5948 return AMDGPU::V_ADD_U64_PSEUDO;
5949 case AMDGPU::S_SUB_U64_PSEUDO:
5950 return AMDGPU::V_SUB_U64_PSEUDO;
5951 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5952 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5953 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5954 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5955 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5956 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5957 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5958 case AMDGPU::S_XNOR_B32:
5959 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5960 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5961 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5962 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5963 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5964 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5965 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5966 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5967 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5968 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5969 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5970 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5971 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5972 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5973 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5974 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5975 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5976 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5977 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5978 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5979 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5980 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5981 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5982 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5983 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5984 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5985 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5986 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5987 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5988 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5989 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5990 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5991 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5992 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5993 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5994 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5995 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5996 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5997 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5998 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5999 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
6000 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
6001 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
6002 case AMDGPU::S_CVT_F32_F16:
6003 case AMDGPU::S_CVT_HI_F32_F16:
6004 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
6005 : AMDGPU::V_CVT_F32_F16_fake16_e64;
6006 case AMDGPU::S_CVT_F16_F32:
6007 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
6008 : AMDGPU::V_CVT_F16_F32_fake16_e64;
6009 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
6010 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
6011 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
6012 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
6013 case AMDGPU::S_CEIL_F16:
6014 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
6015 : AMDGPU::V_CEIL_F16_fake16_e64;
6016 case AMDGPU::S_FLOOR_F16:
6017 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6018 : AMDGPU::V_FLOOR_F16_fake16_e64;
6019 case AMDGPU::S_TRUNC_F16:
6020 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6021 : AMDGPU::V_TRUNC_F16_fake16_e64;
6022 case AMDGPU::S_RNDNE_F16:
6023 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6024 : AMDGPU::V_RNDNE_F16_fake16_e64;
6025 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6026 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6027 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6028 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6029 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6030 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6031 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6032 case AMDGPU::S_ADD_F16:
6033 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6034 : AMDGPU::V_ADD_F16_fake16_e64;
6035 case AMDGPU::S_SUB_F16:
6036 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6037 : AMDGPU::V_SUB_F16_fake16_e64;
6038 case AMDGPU::S_MIN_F16:
6039 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6040 : AMDGPU::V_MIN_F16_fake16_e64;
6041 case AMDGPU::S_MAX_F16:
6042 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6043 : AMDGPU::V_MAX_F16_fake16_e64;
6044 case AMDGPU::S_MINIMUM_F16:
6045 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6046 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6047 case AMDGPU::S_MAXIMUM_F16:
6048 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6049 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6050 case AMDGPU::S_MUL_F16:
6051 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6052 : AMDGPU::V_MUL_F16_fake16_e64;
6053 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6054 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6055 case AMDGPU::S_FMAC_F16:
6056 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6057 : AMDGPU::V_FMAC_F16_fake16_e64;
6058 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6059 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6060 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6061 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6062 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6063 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6064 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6065 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6066 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6067 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6068 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6069 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6070 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6071 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6072 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6073 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6074 case AMDGPU::S_CMP_LT_F16:
6075 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6076 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6077 case AMDGPU::S_CMP_EQ_F16:
6078 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6079 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6080 case AMDGPU::S_CMP_LE_F16:
6081 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6082 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6083 case AMDGPU::S_CMP_GT_F16:
6084 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6085 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6086 case AMDGPU::S_CMP_LG_F16:
6087 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6088 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6089 case AMDGPU::S_CMP_GE_F16:
6090 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6091 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6092 case AMDGPU::S_CMP_O_F16:
6093 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6094 : AMDGPU::V_CMP_O_F16_fake16_e64;
6095 case AMDGPU::S_CMP_U_F16:
6096 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6097 : AMDGPU::V_CMP_U_F16_fake16_e64;
6098 case AMDGPU::S_CMP_NGE_F16:
6099 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6100 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6101 case AMDGPU::S_CMP_NLG_F16:
6102 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6103 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6104 case AMDGPU::S_CMP_NGT_F16:
6105 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6106 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6107 case AMDGPU::S_CMP_NLE_F16:
6108 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6109 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6110 case AMDGPU::S_CMP_NEQ_F16:
6111 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6112 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6113 case AMDGPU::S_CMP_NLT_F16:
6114 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6115 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6116 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6117 case AMDGPU::V_S_EXP_F16_e64:
6118 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6119 : AMDGPU::V_EXP_F16_fake16_e64;
6120 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6121 case AMDGPU::V_S_LOG_F16_e64:
6122 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6123 : AMDGPU::V_LOG_F16_fake16_e64;
6124 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6125 case AMDGPU::V_S_RCP_F16_e64:
6126 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6127 : AMDGPU::V_RCP_F16_fake16_e64;
6128 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6129 case AMDGPU::V_S_RSQ_F16_e64:
6130 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6131 : AMDGPU::V_RSQ_F16_fake16_e64;
6132 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6133 case AMDGPU::V_S_SQRT_F16_e64:
6134 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6135 : AMDGPU::V_SQRT_F16_fake16_e64;
6136 }
6138 "Unexpected scalar opcode without corresponding vector one!");
6139}
6140
6141// clang-format on
6142
6143 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
6144 MachineBasicBlock &MBB,
6145 MachineBasicBlock::iterator MBBI,
6146 const DebugLoc &DL, Register Reg,
6147 bool IsSCCLive,
6148 SlotIndexes *Indexes) const {
6149 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6150 const SIInstrInfo *TII = ST.getInstrInfo();
6152 if (IsSCCLive) {
6153 // Insert two move instructions, one to save the original value of EXEC and
6154 // the other to turn on all bits in EXEC. This is required as we can't use
6155 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
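// Roughly, on a wave64 target this path emits
//   s_mov_b64 <Reg>, exec
//   s_mov_b64 exec, -1
// rather than a single s_or_saveexec_b64, leaving SCC untouched.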
6156 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6157 .addReg(LMC.ExecReg);
6158 auto FlipExecMI =
6159 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6160 if (Indexes) {
6161 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6162 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6163 }
6164 } else {
6165 auto SaveExec =
6166 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6167 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6168 if (Indexes)
6169 Indexes->insertMachineInstrInMaps(*SaveExec);
6170 }
6171}
6172
6173 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6174 MachineBasicBlock::iterator MBBI,
6175 const DebugLoc &DL, Register Reg,
6176 SlotIndexes *Indexes) const {
6178 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6179 .addReg(Reg, RegState::Kill);
6180 if (Indexes)
6181 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6182}
6183
6187 "Not a whole wave func");
6188 MachineBasicBlock &MBB = *MF.begin();
6189 for (MachineInstr &MI : MBB)
6190 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6191 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6192 return &MI;
6193
6194 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6195}
6196
6198 unsigned OpNo) const {
6199 const MCInstrDesc &Desc = get(MI.getOpcode());
6200 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6201 Desc.operands()[OpNo].RegClass == -1) {
6202 Register Reg = MI.getOperand(OpNo).getReg();
6203
6204 if (Reg.isVirtual()) {
6205 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6206 return MRI.getRegClass(Reg);
6207 }
6208 return RI.getPhysRegBaseClass(Reg);
6209 }
6210
6211 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6212 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6213}
6214
6217 MachineBasicBlock *MBB = MI.getParent();
6218 MachineOperand &MO = MI.getOperand(OpIdx);
6219 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6220 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6221 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6222 unsigned Size = RI.getRegSizeInBits(*RC);
6223 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6224 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6225 : AMDGPU::V_MOV_B32_e32;
6226 if (MO.isReg())
6227 Opcode = AMDGPU::COPY;
6228 else if (RI.isSGPRClass(RC))
6229 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6230
6231 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6232 Register Reg = MRI.createVirtualRegister(VRC);
6233 DebugLoc DL = MBB->findDebugLoc(I);
6234 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6235 MO.ChangeToRegister(Reg, false);
6236}
6237
6238 Register SIInstrInfo::buildExtractSubReg(
6239 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6240 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6241 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6242 if (!SuperReg.getReg().isVirtual())
6243 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6244
6245 MachineBasicBlock *MBB = MI->getParent();
6246 const DebugLoc &DL = MI->getDebugLoc();
6247 Register SubReg = MRI.createVirtualRegister(SubRC);
6248
6249 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6250 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6251 .addReg(SuperReg.getReg(), {}, NewSubIdx);
6252 return SubReg;
6253}
6254
6255 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6256 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6257 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6258 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6259 if (Op.isImm()) {
6260 if (SubIdx == AMDGPU::sub0)
6261 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6262 if (SubIdx == AMDGPU::sub1)
6263 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6264
6265 llvm_unreachable("Unhandled register index for immediate");
6266 }
6267
6268 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6269 SubIdx, SubRC);
6270 return MachineOperand::CreateReg(SubReg, false);
6271}
6272
6273// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6274void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6275 assert(Inst.getNumExplicitOperands() == 3);
6276 MachineOperand Op1 = Inst.getOperand(1);
6277 Inst.removeOperand(1);
6278 Inst.addOperand(Op1);
6279}
6280
6281 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6282 const MCOperandInfo &OpInfo,
6283 const MachineOperand &MO) const {
6284 if (!MO.isReg())
6285 return false;
6286
6287 Register Reg = MO.getReg();
6288
6289 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6290 if (Reg.isPhysical())
6291 return DRC->contains(Reg);
6292
6293 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6294
6295 if (MO.getSubReg()) {
6296 const MachineFunction *MF = MO.getParent()->getMF();
6297 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6298 if (!SuperRC)
6299 return false;
6300 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6301 }
6302
6303 return RI.getCommonSubClass(DRC, RC) != nullptr;
6304}
6305
6306 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6307 const MachineOperand &MO) const {
6308 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6309 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6310 unsigned Opc = MI.getOpcode();
6311
6312 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6313 // information.
6314 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6315 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6316 constexpr AMDGPU::OpName OpNames[] = {
6317 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6318
6319 for (auto [I, OpName] : enumerate(OpNames)) {
6320 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6321 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6322 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6323 return false;
6324 }
6325 }
6326
6327 if (!isLegalRegOperand(MRI, OpInfo, MO))
6328 return false;
6329
6330 // Check the accumulator (AGPR) operand.
6331 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6332 if (IsAGPR && !ST.hasMAIInsts())
6333 return false;
6334 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6335 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6336 return false;
6337 // Atomics should have both vdst and vdata either vgpr or agpr.
6338 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6339 const int DataIdx = AMDGPU::getNamedOperandIdx(
6340 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6341 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6342 MI.getOperand(DataIdx).isReg() &&
6343 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6344 return false;
6345 if ((int)OpIdx == DataIdx) {
6346 if (VDstIdx != -1 &&
6347 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6348 return false;
6349 // DS instructions with 2 src operands also must have tied RC.
6350 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6351 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6352 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6353 return false;
6354 }
6355
6356 // Check V_ACCVGPR_WRITE_B32_e64
6357 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6358 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6359 RI.isSGPRReg(MRI, MO.getReg()))
6360 return false;
6361
6362 if (ST.hasFlatScratchHiInB64InstHazard() &&
6363 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6364 if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
6365 if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
6366 64)
6367 return false;
6368 }
6369 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6370 return false;
6371 }
6372
6373 return true;
6374}
6375
6376 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6377 const MCOperandInfo &OpInfo,
6378 const MachineOperand &MO) const {
6379 if (MO.isReg())
6380 return isLegalRegOperand(MRI, OpInfo, MO);
6381
6382 // Handle non-register types that are treated like immediates.
6383 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6384 return true;
6385}
6386
6387 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6388 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6389 const MachineOperand *MO) const {
6390 constexpr unsigned NumOps = 3;
6391 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6392 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6393 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6394 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6395
6396 assert(SrcN < NumOps);
6397
6398 if (!MO) {
6399 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6400 if (SrcIdx == -1)
6401 return true;
6402 MO = &MI.getOperand(SrcIdx);
6403 }
6404
6405 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6406 return true;
6407
6408 int ModsIdx =
6409 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6410 if (ModsIdx == -1)
6411 return true;
6412
6413 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6414 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6415 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6416
6417 return !OpSel && !OpSelHi;
6418}
6419
6420 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6421 const MachineOperand *MO) const {
6422 const MachineFunction &MF = *MI.getMF();
6423 const MachineRegisterInfo &MRI = MF.getRegInfo();
6424 const MCInstrDesc &InstDesc = MI.getDesc();
6425 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6426 int64_t RegClass = getOpRegClassID(OpInfo);
6427 const TargetRegisterClass *DefinedRC =
6428 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6429 if (!MO)
6430 MO = &MI.getOperand(OpIdx);
6431
6432 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6433
6434 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6435 const MachineOperand *UsedLiteral = nullptr;
6436
6437 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6438 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6439
6440 // TODO: Be more permissive with frame indexes.
6441 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6442 if (!LiteralLimit--)
6443 return false;
6444
6445 UsedLiteral = MO;
6446 }
6447
6449 if (MO->isReg())
6450 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6451
6452 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6453 if (i == OpIdx)
6454 continue;
6455 const MachineOperand &Op = MI.getOperand(i);
6456 if (Op.isReg()) {
6457 if (Op.isUse()) {
6458 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6459 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6460 if (--ConstantBusLimit <= 0)
6461 return false;
6462 }
6463 }
6464 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6465 !isInlineConstant(Op, InstDesc.operands()[i])) {
6466 // The same literal may be used multiple times.
6467 if (!UsedLiteral)
6468 UsedLiteral = &Op;
6469 else if (UsedLiteral->isIdenticalTo(Op))
6470 continue;
6471
6472 if (!LiteralLimit--)
6473 return false;
6474 if (--ConstantBusLimit <= 0)
6475 return false;
6476 }
6477 }
6478 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6479 // There can be at most one literal operand, but it can be repeated.
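// Illustration: reusing the same 32-bit literal in two source slots is fine,
// but folding a second, different literal would be rejected below.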
6480 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6481 if (i == OpIdx)
6482 continue;
6483 const MachineOperand &Op = MI.getOperand(i);
6484 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6485 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6486 !Op.isIdenticalTo(*MO))
6487 return false;
6488
6489 // Do not fold a non-inlineable, non-register operand into an
6490 // instruction that already has a frame index. The frame index handling
6491 // code does not cope well when a frame index coexists with another
6492 // non-register operand, unless that operand is an inlineable immediate.
6493 if (Op.isFI())
6494 return false;
6495 }
6496 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6497 isF16PseudoScalarTrans(MI.getOpcode())) {
6498 return false;
6499 }
6500
6501 if (MO->isReg()) {
6502 if (!DefinedRC)
6503 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6504 return isLegalRegOperand(MI, OpIdx, *MO);
6505 }
6506
6507 if (MO->isImm()) {
6508 uint64_t Imm = MO->getImm();
6509 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6510 bool Is64BitOp = Is64BitFPOp ||
6511 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6512 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6513 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6514 if (Is64BitOp &&
6515 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6516 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6517 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6518 return false;
6519
6520 // FIXME: We can use sign extended 64-bit literals, but only for signed
6521 // operands. At the moment we do not know if an operand is signed.
6522 // Such an operand will be encoded as its low 32 bits and then either
6523 // correctly sign extended or incorrectly zero extended by HW.
6524 // If 64-bit literals are supported and the literal will be encoded
6525 // as a full 64 bits, we can still use it.
6526 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6527 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6528 return false;
6529 }
6530 }
6531
6532 // Handle non-register types that are treated like immediates.
6533 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6534
6535 if (!DefinedRC) {
6536 // This operand expects an immediate.
6537 return true;
6538 }
6539
6540 return isImmOperandLegal(MI, OpIdx, *MO);
6541}
6542
6544 bool IsGFX950Only = ST.hasGFX950Insts();
6545 bool IsGFX940Only = ST.hasGFX940Insts();
6546
6547 if (!IsGFX950Only && !IsGFX940Only)
6548 return false;
6549
6550 if (!isVALU(MI))
6551 return false;
6552
6553 // V_COS, V_EXP, V_RCP, etc.
6554 if (isTRANS(MI))
6555 return true;
6556
6557 // DOT2, DOT2C, DOT4, etc.
6558 if (isDOT(MI))
6559 return true;
6560
6561 // MFMA, SMFMA
6562 if (isMFMA(MI))
6563 return true;
6564
6565 unsigned Opcode = MI.getOpcode();
6566 switch (Opcode) {
6567 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6568 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6569 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6570 case AMDGPU::V_MQSAD_U32_U8_e64:
6571 case AMDGPU::V_PK_ADD_F16:
6572 case AMDGPU::V_PK_ADD_F32:
6573 case AMDGPU::V_PK_ADD_I16:
6574 case AMDGPU::V_PK_ADD_U16:
6575 case AMDGPU::V_PK_ASHRREV_I16:
6576 case AMDGPU::V_PK_FMA_F16:
6577 case AMDGPU::V_PK_FMA_F32:
6578 case AMDGPU::V_PK_FMAC_F16_e32:
6579 case AMDGPU::V_PK_FMAC_F16_e64:
6580 case AMDGPU::V_PK_LSHLREV_B16:
6581 case AMDGPU::V_PK_LSHRREV_B16:
6582 case AMDGPU::V_PK_MAD_I16:
6583 case AMDGPU::V_PK_MAD_U16:
6584 case AMDGPU::V_PK_MAX_F16:
6585 case AMDGPU::V_PK_MAX_I16:
6586 case AMDGPU::V_PK_MAX_U16:
6587 case AMDGPU::V_PK_MIN_F16:
6588 case AMDGPU::V_PK_MIN_I16:
6589 case AMDGPU::V_PK_MIN_U16:
6590 case AMDGPU::V_PK_MOV_B32:
6591 case AMDGPU::V_PK_MUL_F16:
6592 case AMDGPU::V_PK_MUL_F32:
6593 case AMDGPU::V_PK_MUL_LO_U16:
6594 case AMDGPU::V_PK_SUB_I16:
6595 case AMDGPU::V_PK_SUB_U16:
6596 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6597 return true;
6598 default:
6599 return false;
6600 }
6601}
6602
6603 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6604 MachineInstr &MI) const {
6605 unsigned Opc = MI.getOpcode();
6606 const MCInstrDesc &InstrDesc = get(Opc);
6607
6608 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6609 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6610
6611 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6612 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6613
6614 // If there is an implicit SGPR use such as the VCC use of v_addc_u32/v_subb_u32,
6615 // we are limited to a single constant bus use before GFX10.
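// Illustration: v_addc_u32_e32 implicitly reads VCC, which already takes the
// single pre-GFX10 constant bus slot, so an SGPR in src0 has to be moved.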
6616 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6617 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6618 RI.isSGPRReg(MRI, Src0.getReg()))
6619 legalizeOpWithMove(MI, Src0Idx);
6620
6621 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6622 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6623 // src0/src1 with V_READFIRSTLANE.
6624 if (Opc == AMDGPU::V_WRITELANE_B32) {
6625 const DebugLoc &DL = MI.getDebugLoc();
6626 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6627 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6628 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6629 .add(Src0);
6630 Src0.ChangeToRegister(Reg, false);
6631 }
6632 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6633 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6634 const DebugLoc &DL = MI.getDebugLoc();
6635 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6636 .add(Src1);
6637 Src1.ChangeToRegister(Reg, false);
6638 }
6639 return;
6640 }
6641
6642 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6643 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6644 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6645 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6646 legalizeOpWithMove(MI, Src2Idx);
6647 }
6648
6649 // VOP2 src0 accepts all operand types, so we don't need to check its
6650 // legality. If src1 is already legal, we don't need to do anything.
6651 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6652 return;
6653
6654 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6655 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6656 // select is uniform.
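// Illustration: for v_readlane_b32 s0, v1, v2 the VGPR lane select v2 is
// routed through v_readfirstlane_b32 into an SGPR before it is used.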
6657 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6658 RI.isVGPR(MRI, Src1.getReg())) {
6659 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6660 const DebugLoc &DL = MI.getDebugLoc();
6661 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6662 .add(Src1);
6663 Src1.ChangeToRegister(Reg, false);
6664 return;
6665 }
6666
6667 // We do not use commuteInstruction here because it is too aggressive and will
6668 // commute whenever possible. We only want to commute here if it improves
6669 // legality. This can be called a fairly large number of times, so don't waste
6670 // compile time pointlessly swapping and checking legality again.
6671 if (HasImplicitSGPR || !MI.isCommutable()) {
6672 legalizeOpWithMove(MI, Src1Idx);
6673 return;
6674 }
6675
6676 // If src0 can be used as src1, commuting will make the operands legal.
6677 // Otherwise we have to give up and insert a move.
6678 //
6679 // TODO: Other immediate-like operand kinds could be commuted if there was a
6680 // MachineOperand::ChangeTo* for them.
6681 if ((!Src1.isImm() && !Src1.isReg()) ||
6682 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6683 legalizeOpWithMove(MI, Src1Idx);
6684 return;
6685 }
6686
6687 int CommutedOpc = commuteOpcode(MI);
6688 if (CommutedOpc == -1) {
6689 legalizeOpWithMove(MI, Src1Idx);
6690 return;
6691 }
6692
6693 MI.setDesc(get(CommutedOpc));
6694
6695 Register Src0Reg = Src0.getReg();
6696 unsigned Src0SubReg = Src0.getSubReg();
6697 bool Src0Kill = Src0.isKill();
6698
6699 if (Src1.isImm())
6700 Src0.ChangeToImmediate(Src1.getImm());
6701 else if (Src1.isReg()) {
6702 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6703 Src0.setSubReg(Src1.getSubReg());
6704 } else
6705 llvm_unreachable("Should only have register or immediate operands");
6706
6707 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6708 Src1.setSubReg(Src0SubReg);
6709 fixImplicitOperands(MI);
6710}
6711
6712 // Legalize VOP3 operands. All operand types are supported for any operand,
6713 // but only one literal constant is allowed, and only starting from GFX10.
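// Illustration: on GFX10+ a VOP3 encoding may carry one 32-bit literal such as
// 0x1234; older targets (or a second distinct literal) require the value to be
// materialized with a move first.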
6714 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6715 MachineInstr &MI) const {
6716 unsigned Opc = MI.getOpcode();
6717
6718 int VOP3Idx[3] = {
6719 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6720 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6721 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6722 };
6723
6724 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6725 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6726 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6727 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6728 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6729 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6730 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6731 // src1 and src2 must be scalar
6732 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6733 const DebugLoc &DL = MI.getDebugLoc();
6734 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6735 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6736 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6737 .add(Src1);
6738 Src1.ChangeToRegister(Reg, false);
6739 }
6740 if (VOP3Idx[2] != -1) {
6741 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6742 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6743 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6744 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6745 .add(Src2);
6746 Src2.ChangeToRegister(Reg, false);
6747 }
6748 }
6749 }
6750
6751 // Find the one SGPR operand we are allowed to use.
6752 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6753 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6754 SmallDenseSet<unsigned> SGPRsUsed;
6755 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6756 if (SGPRReg) {
6757 SGPRsUsed.insert(SGPRReg);
6758 --ConstantBusLimit;
6759 }
6760
6761 for (int Idx : VOP3Idx) {
6762 if (Idx == -1)
6763 break;
6764 MachineOperand &MO = MI.getOperand(Idx);
6765
6766 if (!MO.isReg()) {
6767 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6768 continue;
6769
6770 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6771 --LiteralLimit;
6772 --ConstantBusLimit;
6773 continue;
6774 }
6775
6776 --LiteralLimit;
6777 --ConstantBusLimit;
6778 legalizeOpWithMove(MI, Idx);
6779 continue;
6780 }
6781
6782 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6783 continue; // VGPRs are legal
6784
6785 // We can use one SGPR in each VOP3 instruction prior to GFX10
6786 // and two starting from GFX10.
6787 if (SGPRsUsed.count(MO.getReg()))
6788 continue;
6789 if (ConstantBusLimit > 0) {
6790 SGPRsUsed.insert(MO.getReg());
6791 --ConstantBusLimit;
6792 continue;
6793 }
6794
6795 // If we make it this far, then the operand is not legal and we must
6796 // legalize it.
6797 legalizeOpWithMove(MI, Idx);
6798 }
6799
6800 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6801 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6802 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6803 legalizeOpWithMove(MI, VOP3Idx[2]);
6804
6805 // Fix the register class of packed FP32 instructions on gfx12+. See
6806 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6807 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6808 for (unsigned I = 0; I < 3; ++I) {
6809 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6810 legalizeOpWithMove(MI, VOP3Idx[I]);
6811 }
6812 }
6813}
6814
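// The helper below (readlaneVGPRToSGPR, used by the SMRD/FLAT legalization
// further down) copies a VGPR value into a newly created SGPR of the same
// width: each 32-bit channel is read with V_READFIRSTLANE_B32 and the pieces
// are reassembled with a REG_SEQUENCE. AGPR sources are first copied into
// VGPRs before the readfirstlane.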
6817 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6818 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6819 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6820 if (DstRC)
6821 SRC = RI.getCommonSubClass(SRC, DstRC);
6822
6823 Register DstReg = MRI.createVirtualRegister(SRC);
6824 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6825
6826 if (RI.hasAGPRs(VRC)) {
6827 VRC = RI.getEquivalentVGPRClass(VRC);
6828 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6829 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6830 get(TargetOpcode::COPY), NewSrcReg)
6831 .addReg(SrcReg);
6832 SrcReg = NewSrcReg;
6833 }
6834
6835 if (SubRegs == 1) {
6836 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6837 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6838 .addReg(SrcReg);
6839 return DstReg;
6840 }
6841
6842 SmallVector<Register, 8> SRegs;
6843 for (unsigned i = 0; i < SubRegs; ++i) {
6844 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6845 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6846 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6847 .addReg(SrcReg, {}, RI.getSubRegFromChannel(i));
6848 SRegs.push_back(SGPR);
6849 }
6850
6851 MachineInstrBuilder MIB =
6852 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6853 get(AMDGPU::REG_SEQUENCE), DstReg);
6854 for (unsigned i = 0; i < SubRegs; ++i) {
6855 MIB.addReg(SRegs[i]);
6856 MIB.addImm(RI.getSubRegFromChannel(i));
6857 }
6858 return DstReg;
6859}
6860
6862 MachineInstr &MI) const {
6863
6864 // If the pointer is stored in VGPRs, then we need to move it to
6865 // SGPRs using v_readfirstlane. This is safe because we only select
6866 // loads with uniform pointers to SMRD instructions, so we know the
6867 // pointer value is uniform.
6868 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6869 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6870 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6871 SBase->setReg(SGPR);
6872 }
6873 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6874 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6875 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6876 SOff->setReg(SGPR);
6877 }
6878}
6879
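// The helper below rewrites a FLAT/GLOBAL instruction whose saddr operand is
// actually held in a VGPR into the corresponding vaddr form. The rewrite is
// only attempted when the existing vaddr is absent or a materialized zero, so
// the effective address is unchanged; on success, a now-unused zero-vaddr def
// is deleted.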
6881 unsigned Opc = Inst.getOpcode();
6882 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6883 if (OldSAddrIdx < 0)
6884 return false;
6885
6886 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6887
6888 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6889 if (NewOpc < 0)
6890 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6891 if (NewOpc < 0)
6892 return false;
6893
6895 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6896 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6897 return false;
6898
6899 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6900 if (NewVAddrIdx < 0)
6901 return false;
6902
6903 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6904
6905 // Check vaddr; it must be zero or absent.
6906 MachineInstr *VAddrDef = nullptr;
6907 if (OldVAddrIdx >= 0) {
6908 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6909 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6910 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6911 !VAddrDef->getOperand(1).isImm() ||
6912 VAddrDef->getOperand(1).getImm() != 0)
6913 return false;
6914 }
6915
6916 const MCInstrDesc &NewDesc = get(NewOpc);
6917 Inst.setDesc(NewDesc);
6918
6919 // Callers expect iterator to be valid after this call, so modify the
6920 // instruction in place.
6921 if (OldVAddrIdx == NewVAddrIdx) {
6922 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6923 // Clear use list from the old vaddr holding a zero register.
6924 MRI.removeRegOperandFromUseList(&NewVAddr);
6925 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6926 Inst.removeOperand(OldSAddrIdx);
6927 // Update the use list with the pointer we have just moved from vaddr to
6928 // saddr position. Otherwise new vaddr will be missing from the use list.
6929 MRI.removeRegOperandFromUseList(&NewVAddr);
6930 MRI.addRegOperandToUseList(&NewVAddr);
6931 } else {
6932 assert(OldSAddrIdx == NewVAddrIdx);
6933
6934 if (OldVAddrIdx >= 0) {
6935 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6936 AMDGPU::OpName::vdst_in);
6937
6938 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6939 // it asserts. Untie the operands for now and retie them afterwards.
6940 if (NewVDstIn != -1) {
6941 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6942 Inst.untieRegOperand(OldVDstIn);
6943 }
6944
6945 Inst.removeOperand(OldVAddrIdx);
6946
6947 if (NewVDstIn != -1) {
6948 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6949 Inst.tieOperands(NewVDst, NewVDstIn);
6950 }
6951 }
6952 }
6953
6954 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6955 VAddrDef->eraseFromParent();
6956
6957 return true;
6958}
6959
6960// FIXME: Remove this when SelectionDAG is obsoleted.
6962 MachineInstr &MI) const {
6963 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6964 return;
6965
6966 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6967 // analysis thinks they are uniform, so a readfirstlane should be valid.
6968 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6969 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6970 return;
6971
6973 return;
6974
6975 const TargetRegisterClass *DeclaredRC =
6976 getRegClass(MI.getDesc(), SAddr->getOperandNo());
6977
6978 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6979 SAddr->setReg(ToSGPR);
6980}
6981
6984 const TargetRegisterClass *DstRC,
6987 const DebugLoc &DL) const {
6988 Register OpReg = Op.getReg();
6989 unsigned OpSubReg = Op.getSubReg();
6990
6991 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6992 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6993
6994 // Check if operand is already the correct register class.
6995 if (DstRC == OpRC)
6996 return;
6997
6998 Register DstReg = MRI.createVirtualRegister(DstRC);
6999 auto Copy =
7000 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
7001 Op.setReg(DstReg);
7002
7003 MachineInstr *Def = MRI.getVRegDef(OpReg);
7004 if (!Def)
7005 return;
7006
7007 // Try to eliminate the copy if it is copying an immediate value.
7008 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
7009 foldImmediate(*Copy, *Def, OpReg, &MRI);
7010
7011 bool ImpDef = Def->isImplicitDef();
7012 while (!ImpDef && Def && Def->isCopy()) {
7013 if (Def->getOperand(1).getReg().isPhysical())
7014 break;
7015 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
7016 ImpDef = Def && Def->isImplicitDef();
7017 }
7018 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
7019 !ImpDef)
7020 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
7021}
7022
7023// Emit the actual waterfall loop, executing the wrapped instruction for each
7024// unique value of \p ScalarOps across all lanes. In the best case we execute 1
7025// iteration, in the worst case we execute 64 (once per lane).
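// Roughly, for a single 32-bit ScalarOp the generated code looks like:
//
//   LoopBB:
//     %sgpr     = V_READFIRSTLANE_B32 %vgpr_op
//     %cond     = V_CMP_EQ_U32_e64 %sgpr, %vgpr_op
//     %saveexec = S_AND_SAVEEXEC_B32/_B64 %cond
//   BodyBB:
//     ... the rewritten instruction, now using %sgpr ...
//     $exec = S_XOR_B32/_B64_term $exec, %saveexec
//     SI_WATERFALL_LOOP %LoopBB
//
// Wider operands are read 64 bits at a time and compared with
// V_CMP_EQ_U64_e64, and the per-piece conditions are combined with an AND.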
7026static void
7029 MachineBasicBlock &LoopBB,
7030 MachineBasicBlock &BodyBB,
7031 const DebugLoc &DL,
7032 ArrayRef<MachineOperand *> ScalarOps) {
7033 MachineFunction &MF = *LoopBB.getParent();
7034 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7035 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7037 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7038
7040 Register CondReg;
7041
7042 for (MachineOperand *ScalarOp : ScalarOps) {
7043 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
7044 unsigned NumSubRegs = RegSize / 32;
7045 Register VScalarOp = ScalarOp->getReg();
7046
7047 if (NumSubRegs == 1) {
7048 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7049
7050 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
7051 .addReg(VScalarOp);
7052
7053 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7054
7055 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
7056 .addReg(CurReg)
7057 .addReg(VScalarOp);
7058
7059 // Combine the comparison results with AND.
7060 if (!CondReg) // First.
7061 CondReg = NewCondReg;
7062 else { // If not the first, we create an AND.
7063 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7064 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7065 .addReg(CondReg)
7066 .addReg(NewCondReg);
7067 CondReg = AndReg;
7068 }
7069
7070 // Update ScalarOp operand to use the SGPR ScalarOp.
7071 ScalarOp->setReg(CurReg);
7072 ScalarOp->setIsKill();
7073 } else {
7074 SmallVector<Register, 8> ReadlanePieces;
7075 RegState VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
7076 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7077 "Unhandled register size");
7078
7079 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7080 Register CurRegLo =
7081 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7082 Register CurRegHi =
7083 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7084
7085 // Read the next variant <- also loop target.
7086 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
7087 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
7088
7089 // Read the next variant <- also loop target.
7090 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
7091 .addReg(VScalarOp, VScalarOpUndef,
7092 TRI->getSubRegFromChannel(Idx + 1));
7093
7094 ReadlanePieces.push_back(CurRegLo);
7095 ReadlanePieces.push_back(CurRegHi);
7096
7097 // Comparison is to be done as 64-bit.
7098 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
7099 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
7100 .addReg(CurRegLo)
7101 .addImm(AMDGPU::sub0)
7102 .addReg(CurRegHi)
7103 .addImm(AMDGPU::sub1);
7104
7105 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7106 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
7107 NewCondReg)
7108 .addReg(CurReg);
7109 if (NumSubRegs <= 2)
7110 Cmp.addReg(VScalarOp);
7111 else
7112 Cmp.addReg(VScalarOp, VScalarOpUndef,
7113 TRI->getSubRegFromChannel(Idx, 2));
7114
7115 // Combine the comparison results with AND.
7116 if (!CondReg) // First.
7117 CondReg = NewCondReg;
7118 else { // If not the first, we create an AND.
7119 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7120 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7121 .addReg(CondReg)
7122 .addReg(NewCondReg);
7123 CondReg = AndReg;
7124 }
7125 } // End for loop.
7126
7127 const auto *SScalarOpRC =
7128 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
7129 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
7130
7131 // Build scalar ScalarOp.
7132 auto Merge =
7133 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7134 unsigned Channel = 0;
7135 for (Register Piece : ReadlanePieces) {
7136 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7137 }
7138
7139 // Update ScalarOp operand to use the SGPR ScalarOp.
7140 ScalarOp->setReg(SScalarOp);
7141 ScalarOp->setIsKill();
7142 }
7143 }
7144
7145 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7146 MRI.setSimpleHint(SaveExec, CondReg);
7147
7148 // Update EXEC to matching lanes, saving original to SaveExec.
7149 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7150 .addReg(CondReg, RegState::Kill);
7151
7152 // The original instruction is here; we insert the terminators after it.
7153 I = BodyBB.end();
7154
7155 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7156 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7157 .addReg(LMC.ExecReg)
7158 .addReg(SaveExec);
7159
7160 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7161}
7162
7163// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7164// with SGPRs by iterating over all unique values across all lanes.
7165// Returns the loop basic block that now contains \p MI.
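// The block containing \p MI is split so that the resulting CFG is roughly:
//
//   MBB (EXEC and, if live, SCC saved here)
//     |
//   LoopBB <-----+
//     |          |
//   BodyBB ------+   (contains MI; loops back while unprocessed lanes remain)
//     |
//   RemainderBB      (EXEC/SCC restored; rest of the original block)
//
// SCC is preserved with an S_CSELECT_B32 before the loop and an S_CMP_LG_U32
// afterwards, since the loop itself may clobber it.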
7166static MachineBasicBlock *
7170 MachineBasicBlock::iterator Begin = nullptr,
7171 MachineBasicBlock::iterator End = nullptr) {
7172 MachineBasicBlock &MBB = *MI.getParent();
7173 MachineFunction &MF = *MBB.getParent();
7174 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7175 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7177 if (!Begin.isValid())
7178 Begin = &MI;
7179 if (!End.isValid()) {
7180 End = &MI;
7181 ++End;
7182 }
7183 const DebugLoc &DL = MI.getDebugLoc();
7185 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7186
7187 // Save SCC. Waterfall Loop may overwrite SCC.
7188 Register SaveSCCReg;
7189
7190 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7191 // rather than doing an unlimited scan everywhere.
7192 bool SCCNotDead =
7193 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7194 std::numeric_limits<unsigned>::max()) !=
7196 if (SCCNotDead) {
7197 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7198 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7199 .addImm(1)
7200 .addImm(0);
7201 }
7202
7203 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7204
7205 // Save the EXEC mask
7206 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7207
7208 // Killed uses in the instruction we are waterfalling around will be
7209 // incorrect due to the added control-flow.
7211 ++AfterMI;
7212 for (auto I = Begin; I != AfterMI; I++) {
7213 for (auto &MO : I->all_uses())
7214 MRI.clearKillFlags(MO.getReg());
7215 }
7216
7217 // To insert the loop we need to split the block. Move everything after this
7218 // point to a new block, and insert a new empty block between the two.
7221 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7223 ++MBBI;
7224
7225 MF.insert(MBBI, LoopBB);
7226 MF.insert(MBBI, BodyBB);
7227 MF.insert(MBBI, RemainderBB);
7228
7229 LoopBB->addSuccessor(BodyBB);
7230 BodyBB->addSuccessor(LoopBB);
7231 BodyBB->addSuccessor(RemainderBB);
7232
7233 // Move the instructions from Begin to MI into BodyBB, and the remainder of
7234 // the block into RemainderBB.
7235 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7236 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7237 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7238
7239 MBB.addSuccessor(LoopBB);
7240
7241 // Update dominators. We know that MBB immediately dominates LoopBB, that
7242 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7243 // RemainderBB. RemainderBB immediately dominates all of the successors
7244 // transferred to it from MBB that MBB used to properly dominate.
7245 if (MDT) {
7246 MDT->addNewBlock(LoopBB, &MBB);
7247 MDT->addNewBlock(BodyBB, LoopBB);
7248 MDT->addNewBlock(RemainderBB, BodyBB);
7249 for (auto &Succ : RemainderBB->successors()) {
7250 if (MDT->properlyDominates(&MBB, Succ)) {
7251 MDT->changeImmediateDominator(Succ, RemainderBB);
7252 }
7253 }
7254 }
7255
7256 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7257
7258 MachineBasicBlock::iterator First = RemainderBB->begin();
7259 // Restore SCC
7260 if (SCCNotDead) {
7261 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7262 .addReg(SaveSCCReg, RegState::Kill)
7263 .addImm(0);
7264 }
7265
7266 // Restore the EXEC mask
7267 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7268 .addReg(SaveExec);
7269 return BodyBB;
7270}
7271
7272// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
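// The replacement descriptor keeps only the default data format in dwords 2-3
// and a zero base address in dwords 0-1, so the rewritten access computes its
// address entirely from the (extracted pointer + vaddr) sum built by the
// caller.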
7273static std::tuple<unsigned, unsigned>
7275 MachineBasicBlock &MBB = *MI.getParent();
7276 MachineFunction &MF = *MBB.getParent();
7278
7279 // Extract the ptr from the resource descriptor.
7280 unsigned RsrcPtr =
7281 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7282 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7283
7284 // Create an empty resource descriptor
7285 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7286 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7287 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7288 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7289 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7290
7291 // Zero64 = 0
7292 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7293 .addImm(0);
7294
7295 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7296 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7297 .addImm(Lo_32(RsrcDataFormat));
7298
7299 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7300 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7301 .addImm(Hi_32(RsrcDataFormat));
7302
7303 // NewSRsrc = {Zero64, SRsrcFormat}
7304 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7305 .addReg(Zero64)
7306 .addImm(AMDGPU::sub0_sub1)
7307 .addReg(SRsrcFormatLo)
7308 .addImm(AMDGPU::sub2)
7309 .addReg(SRsrcFormatHi)
7310 .addImm(AMDGPU::sub3);
7311
7312 return std::tuple(RsrcPtr, NewSRsrc);
7313}
7314
7317 MachineDominatorTree *MDT) const {
7318 MachineFunction &MF = *MI.getMF();
7320 MachineBasicBlock *CreatedBB = nullptr;
7321
7322 // Legalize VOP2
7323 if (isVOP2(MI) || isVOPC(MI)) {
7325 return CreatedBB;
7326 }
7327
7328 // Legalize VOP3
7329 if (isVOP3(MI)) {
7331 return CreatedBB;
7332 }
7333
7334 // Legalize SMRD
7335 if (isSMRD(MI)) {
7337 return CreatedBB;
7338 }
7339
7340 // Legalize FLAT
7341 if (isFLAT(MI)) {
7343 return CreatedBB;
7344 }
7345
7346 // Legalize PHI
7347 // The register class of the operands must be the same type as the register
7348 // class of the output.
7349 if (MI.getOpcode() == AMDGPU::PHI) {
7350 const TargetRegisterClass *VRC = getOpRegClass(MI, 0);
7351 assert(!RI.isSGPRClass(VRC));
7352
7353 // Update all the operands so they have the same type.
7354 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7355 MachineOperand &Op = MI.getOperand(I);
7356 if (!Op.isReg() || !Op.getReg().isVirtual())
7357 continue;
7358
7359 // MI is a PHI instruction.
7360 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7362
7363 // Avoid creating no-op copies with the same src and dst reg class. These
7364 // confuse some of the machine passes.
7365 legalizeGenericOperand(*InsertBB, Insert, VRC, Op, MRI, MI.getDebugLoc());
7366 }
7367 }
7368
7369 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7370 // VGPR dest type and SGPR sources, insert copies so all operands are
7371 // VGPRs. This seems to help operand folding / the register coalescer.
7372 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7373 MachineBasicBlock *MBB = MI.getParent();
7374 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7375 if (RI.hasVGPRs(DstRC)) {
7376 // Update all the operands so they are VGPR register classes. These may
7377 // not be the same register class because REG_SEQUENCE supports mixing
7378 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7379 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7380 MachineOperand &Op = MI.getOperand(I);
7381 if (!Op.isReg() || !Op.getReg().isVirtual())
7382 continue;
7383
7384 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7385 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7386 if (VRC == OpRC)
7387 continue;
7388
7389 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7390 Op.setIsKill();
7391 }
7392 }
7393
7394 return CreatedBB;
7395 }
7396
7397 // Legalize INSERT_SUBREG
7398 // src0 must have the same register class as dst
7399 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7400 Register Dst = MI.getOperand(0).getReg();
7401 Register Src0 = MI.getOperand(1).getReg();
7402 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7403 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7404 if (DstRC != Src0RC) {
7405 MachineBasicBlock *MBB = MI.getParent();
7406 MachineOperand &Op = MI.getOperand(1);
7407 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7408 }
7409 return CreatedBB;
7410 }
7411
7412 // Legalize SI_INIT_M0
7413 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7414 MachineOperand &Src = MI.getOperand(0);
7415 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7416 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7417 return CreatedBB;
7418 }
7419
7420 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7421 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7422 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7423 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7424 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7425 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7426 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7427 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7428 MachineOperand &Src = MI.getOperand(1);
7429 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7430 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7431 return CreatedBB;
7432 }
7433
7434 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7435 //
7436 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7437 // scratch memory access. In both cases, the legalization never involves
7438 // conversion to the addr64 form.
7440 (isMUBUF(MI) || isMTBUF(MI)))) {
7441 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7442 ? AMDGPU::OpName::rsrc
7443 : AMDGPU::OpName::srsrc;
7444 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7445 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7446 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7447
7448 AMDGPU::OpName SampOpName =
7449 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7450 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7451 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7452 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7453
7454 return CreatedBB;
7455 }
7456
7457 // Legalize SI_CALL
7458 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7459 MachineOperand *Dest = &MI.getOperand(0);
7460 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7461 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, plus the
7462 // following copies, into the loop block; copies from and to physical
7463 // registers need to be moved as well.
7464 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7465 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7466
7467 // Also move the copies to physical registers into the loop block
7468 MachineBasicBlock &MBB = *MI.getParent();
7470 while (Start->getOpcode() != FrameSetupOpcode)
7471 --Start;
7473 while (End->getOpcode() != FrameDestroyOpcode)
7474 ++End;
7475 // Also include following copies of the return value
7476 ++End;
7477 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7478 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7479 ++End;
7480 CreatedBB =
7481 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7482 }
7483 }
7484
7485 // Legalize s_sleep_var.
7486 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7487 const DebugLoc &DL = MI.getDebugLoc();
7488 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7489 int Src0Idx =
7490 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7491 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7492 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7493 .add(Src0);
7494 Src0.ChangeToRegister(Reg, false);
7495 return nullptr;
7496 }
7497
7498 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7499 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7500 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7501 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7502 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7503 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7504 for (MachineOperand &Src : MI.explicit_operands()) {
7505 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7506 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7507 }
7508 return CreatedBB;
7509 }
7510
7511 // Legalize MUBUF instructions.
7512 bool isSoffsetLegal = true;
7513 int SoffsetIdx =
7514 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7515 if (SoffsetIdx != -1) {
7516 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7517 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7518 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7519 isSoffsetLegal = false;
7520 }
7521 }
7522
7523 bool isRsrcLegal = true;
7524 int RsrcIdx =
7525 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7526 if (RsrcIdx != -1) {
7527 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7528 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7529 isRsrcLegal = false;
7530 }
7531
7532 // The operands are legal.
7533 if (isRsrcLegal && isSoffsetLegal)
7534 return CreatedBB;
7535
7536 if (!isRsrcLegal) {
7537 // Legalize a VGPR Rsrc
7538 //
7539 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7540 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7541 // a zero-value SRsrc.
7542 //
7543 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7544 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7545 // above.
7546 //
7547 // Otherwise we are on non-ADDR64 hardware, and/or we have
7548 // idxen/offen/bothen and we fall back to a waterfall loop.
7549
7550 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7551 MachineBasicBlock &MBB = *MI.getParent();
7552
7553 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7554 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7555 // This is already an ADDR64 instruction so we need to add the pointer
7556 // extracted from the resource descriptor to the current value of VAddr.
7557 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7558 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7559 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7560
7561 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7562 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7563 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7564
7565 unsigned RsrcPtr, NewSRsrc;
7566 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7567
7568 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7569 const DebugLoc &DL = MI.getDebugLoc();
7570 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7571 .addDef(CondReg0)
7572 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7573 .addReg(VAddr->getReg(), {}, AMDGPU::sub0)
7574 .addImm(0);
7575
7576 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7577 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7578 .addDef(CondReg1, RegState::Dead)
7579 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7580 .addReg(VAddr->getReg(), {}, AMDGPU::sub1)
7581 .addReg(CondReg0, RegState::Kill)
7582 .addImm(0);
7583
7584 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7585 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7586 .addReg(NewVAddrLo)
7587 .addImm(AMDGPU::sub0)
7588 .addReg(NewVAddrHi)
7589 .addImm(AMDGPU::sub1);
7590
7591 VAddr->setReg(NewVAddr);
7592 Rsrc->setReg(NewSRsrc);
7593 } else if (!VAddr && ST.hasAddr64()) {
7594 // This instruction is the _OFFSET variant, so we need to convert it to
7595 // ADDR64.
7596 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7597 "FIXME: Need to emit flat atomics here");
7598
7599 unsigned RsrcPtr, NewSRsrc;
7600 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7601
7602 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7603 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7604 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7605 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7606 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7607
7608 // Atomics with return have an additional tied operand and are
7609 // missing some of the special bits.
7610 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7611 MachineInstr *Addr64;
7612
7613 if (!VDataIn) {
7614 // Regular buffer load / store.
7615 MachineInstrBuilder MIB =
7616 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7617 .add(*VData)
7618 .addReg(NewVAddr)
7619 .addReg(NewSRsrc)
7620 .add(*SOffset)
7621 .add(*Offset);
7622
7623 if (const MachineOperand *CPol =
7624 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7625 MIB.addImm(CPol->getImm());
7626 }
7627
7628 if (const MachineOperand *TFE =
7629 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7630 MIB.addImm(TFE->getImm());
7631 }
7632
7633 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7634
7635 MIB.cloneMemRefs(MI);
7636 Addr64 = MIB;
7637 } else {
7638 // Atomics with return.
7639 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7640 .add(*VData)
7641 .add(*VDataIn)
7642 .addReg(NewVAddr)
7643 .addReg(NewSRsrc)
7644 .add(*SOffset)
7645 .add(*Offset)
7646 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7647 .cloneMemRefs(MI);
7648 }
7649
7650 MI.removeFromParent();
7651
7652 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7653 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7654 NewVAddr)
7655 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7656 .addImm(AMDGPU::sub0)
7657 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7658 .addImm(AMDGPU::sub1);
7659 } else {
7660 // Legalize a VGPR Rsrc and soffset together.
7661 if (!isSoffsetLegal) {
7662 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7663 CreatedBB =
7664 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7665 return CreatedBB;
7666 }
7667 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7668 return CreatedBB;
7669 }
7670 }
7671
7672 // Legalize a VGPR soffset.
7673 if (!isSoffsetLegal) {
7674 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7675 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7676 return CreatedBB;
7677 }
7678 return CreatedBB;
7679}
7680
7682 InstrList.insert(MI);
7683 // Add MBUF instructions to the deferred list.
7684 int RsrcIdx =
7685 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7686 if (RsrcIdx != -1) {
7687 DeferredList.insert(MI);
7688 }
7689}
7690
7692 return DeferredList.contains(MI);
7693}
7694
7695// Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7696// lowering (changing sgpr to vgpr).
7697// This is mainly caused by 16-bit SALU and 16-bit VALU using registers of
7698// different sizes. We need to legalize the size of the operands during the
7699// vgpr lowering chain. This can be removed once sgpr16 is in place.
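// Concretely: a 16-bit VGPR value feeding an operand that expects 32 bits is
// widened by placing it in lo16 of a fresh 32-bit VGPR (hi16 comes from an
// IMPLICIT_DEF) via REG_SEQUENCE; a 32-bit value feeding a 16-bit operand is
// accessed through its lo16 subregister instead.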
7701 MachineRegisterInfo &MRI) const {
7702 if (!ST.useRealTrue16Insts())
7703 return;
7704
7705 unsigned Opcode = MI.getOpcode();
7706 MachineBasicBlock *MBB = MI.getParent();
7707 // Legalize operands and check for size mismatch
7708 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7709 OpIdx >= get(Opcode).getNumOperands() ||
7710 get(Opcode).operands()[OpIdx].RegClass == -1)
7711 return;
7712
7713 MachineOperand &Op = MI.getOperand(OpIdx);
7714 if (!Op.isReg() || !Op.getReg().isVirtual())
7715 return;
7716
7717 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7718 if (!RI.isVGPRClass(CurrRC))
7719 return;
7720
7721 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7722 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7723 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7724 Op.setSubReg(AMDGPU::lo16);
7725 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7726 const DebugLoc &DL = MI.getDebugLoc();
7727 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7728 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7729 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7730 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7731 .addReg(Op.getReg())
7732 .addImm(AMDGPU::lo16)
7733 .addReg(Undef)
7734 .addImm(AMDGPU::hi16);
7735 Op.setReg(NewDstReg);
7736 }
7737}
7739 MachineRegisterInfo &MRI) const {
7740 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7742}
7743
7745 MachineDominatorTree *MDT) const {
7746
7747 while (!Worklist.empty()) {
7748 MachineInstr &Inst = *Worklist.top();
7749 Worklist.erase_top();
7750 // Skip MachineInstr in the deferred list.
7751 if (Worklist.isDeferred(&Inst))
7752 continue;
7753 moveToVALUImpl(Worklist, MDT, Inst);
7754 }
7755
7756 // The deferred list of instructions will be processed once
7757 // all the MachineInstrs in the worklist are done.
7758 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7759 moveToVALUImpl(Worklist, MDT, *Inst);
7760 assert(Worklist.empty() &&
7761 "Deferred MachineInstr are not supposed to re-populate worklist");
7762 }
7763}
7764
7767 MachineInstr &Inst) const {
7768
7770 if (!MBB)
7771 return;
7772 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7773 unsigned Opcode = Inst.getOpcode();
7774 unsigned NewOpcode = getVALUOp(Inst);
7775 const DebugLoc &DL = Inst.getDebugLoc();
7776
7777 // Handle some special cases
7778 switch (Opcode) {
7779 default:
7780 break;
7781 case AMDGPU::S_ADD_I32:
7782 case AMDGPU::S_SUB_I32: {
7783 // FIXME: The u32 versions currently selected use the carry.
7784 bool Changed;
7785 MachineBasicBlock *CreatedBBTmp = nullptr;
7786 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7787 if (Changed)
7788 return;
7789
7790 // Default handling
7791 break;
7792 }
7793
7794 case AMDGPU::S_MUL_U64:
7795 if (ST.hasVectorMulU64()) {
7796 NewOpcode = AMDGPU::V_MUL_U64_e64;
7797 break;
7798 }
7799 // Split s_mul_u64 in 32-bit vector multiplications.
7800 splitScalarSMulU64(Worklist, Inst, MDT);
7801 Inst.eraseFromParent();
7802 return;
7803
7804 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7805 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7806 // This is a special case of s_mul_u64 where all the operands are either
7807 // zero extended or sign extended.
7808 splitScalarSMulPseudo(Worklist, Inst, MDT);
7809 Inst.eraseFromParent();
7810 return;
7811
7812 case AMDGPU::S_AND_B64:
7813 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7814 Inst.eraseFromParent();
7815 return;
7816
7817 case AMDGPU::S_OR_B64:
7818 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7819 Inst.eraseFromParent();
7820 return;
7821
7822 case AMDGPU::S_XOR_B64:
7823 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7824 Inst.eraseFromParent();
7825 return;
7826
7827 case AMDGPU::S_NAND_B64:
7828 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7829 Inst.eraseFromParent();
7830 return;
7831
7832 case AMDGPU::S_NOR_B64:
7833 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7834 Inst.eraseFromParent();
7835 return;
7836
7837 case AMDGPU::S_XNOR_B64:
7838 if (ST.hasDLInsts())
7839 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7840 else
7841 splitScalar64BitXnor(Worklist, Inst, MDT);
7842 Inst.eraseFromParent();
7843 return;
7844
7845 case AMDGPU::S_ANDN2_B64:
7846 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7847 Inst.eraseFromParent();
7848 return;
7849
7850 case AMDGPU::S_ORN2_B64:
7851 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7852 Inst.eraseFromParent();
7853 return;
7854
7855 case AMDGPU::S_BREV_B64:
7856 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7857 Inst.eraseFromParent();
7858 return;
7859
7860 case AMDGPU::S_NOT_B64:
7861 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7862 Inst.eraseFromParent();
7863 return;
7864
7865 case AMDGPU::S_BCNT1_I32_B64:
7866 splitScalar64BitBCNT(Worklist, Inst);
7867 Inst.eraseFromParent();
7868 return;
7869
7870 case AMDGPU::S_BFE_I64:
7871 splitScalar64BitBFE(Worklist, Inst);
7872 Inst.eraseFromParent();
7873 return;
7874
7875 case AMDGPU::S_FLBIT_I32_B64:
7876 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7877 Inst.eraseFromParent();
7878 return;
7879 case AMDGPU::S_FF1_I32_B64:
7880 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7881 Inst.eraseFromParent();
7882 return;
7883
7884 case AMDGPU::S_LSHL_B32:
7885 if (ST.hasOnlyRevVALUShifts()) {
7886 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7887 swapOperands(Inst);
7888 }
7889 break;
7890 case AMDGPU::S_ASHR_I32:
7891 if (ST.hasOnlyRevVALUShifts()) {
7892 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7893 swapOperands(Inst);
7894 }
7895 break;
7896 case AMDGPU::S_LSHR_B32:
7897 if (ST.hasOnlyRevVALUShifts()) {
7898 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7899 swapOperands(Inst);
7900 }
7901 break;
7902 case AMDGPU::S_LSHL_B64:
7903 if (ST.hasOnlyRevVALUShifts()) {
7904 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7905 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7906 : AMDGPU::V_LSHLREV_B64_e64;
7907 swapOperands(Inst);
7908 }
7909 break;
7910 case AMDGPU::S_ASHR_I64:
7911 if (ST.hasOnlyRevVALUShifts()) {
7912 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7913 swapOperands(Inst);
7914 }
7915 break;
7916 case AMDGPU::S_LSHR_B64:
7917 if (ST.hasOnlyRevVALUShifts()) {
7918 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7919 swapOperands(Inst);
7920 }
7921 break;
7922
7923 case AMDGPU::S_ABS_I32:
7924 lowerScalarAbs(Worklist, Inst);
7925 Inst.eraseFromParent();
7926 return;
7927
7928 case AMDGPU::S_ABSDIFF_I32:
7929 lowerScalarAbsDiff(Worklist, Inst);
7930 Inst.eraseFromParent();
7931 return;
7932
7933 case AMDGPU::S_CBRANCH_SCC0:
7934 case AMDGPU::S_CBRANCH_SCC1: {
7935 // Clear unused bits of vcc
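 // (The AND with EXEC below also makes sure that condition bits belonging to
 // inactive lanes cannot make the VCC-based branch appear taken.)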
7936 Register CondReg = Inst.getOperand(1).getReg();
7937 bool IsSCC = CondReg == AMDGPU::SCC;
7939 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
7940 .addReg(LMC.ExecReg)
7941 .addReg(IsSCC ? LMC.VccReg : CondReg);
7942 Inst.removeOperand(1);
7943 } break;
7944
7945 case AMDGPU::S_BFE_U64:
7946 case AMDGPU::S_BFM_B64:
7947 llvm_unreachable("Moving this op to VALU not implemented");
7948
7949 case AMDGPU::S_PACK_LL_B32_B16:
7950 case AMDGPU::S_PACK_LH_B32_B16:
7951 case AMDGPU::S_PACK_HL_B32_B16:
7952 case AMDGPU::S_PACK_HH_B32_B16:
7953 movePackToVALU(Worklist, MRI, Inst);
7954 Inst.eraseFromParent();
7955 return;
7956
7957 case AMDGPU::S_XNOR_B32:
7958 lowerScalarXnor(Worklist, Inst);
7959 Inst.eraseFromParent();
7960 return;
7961
7962 case AMDGPU::S_NAND_B32:
7963 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7964 Inst.eraseFromParent();
7965 return;
7966
7967 case AMDGPU::S_NOR_B32:
7968 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7969 Inst.eraseFromParent();
7970 return;
7971
7972 case AMDGPU::S_ANDN2_B32:
7973 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7974 Inst.eraseFromParent();
7975 return;
7976
7977 case AMDGPU::S_ORN2_B32:
7978 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7979 Inst.eraseFromParent();
7980 return;
7981
7982 // TODO: remove as soon as everything is ready
7983 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7984 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7985 // can only be selected from the uniform SDNode.
7986 case AMDGPU::S_ADD_CO_PSEUDO:
7987 case AMDGPU::S_SUB_CO_PSEUDO: {
7988 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7989 ? AMDGPU::V_ADDC_U32_e64
7990 : AMDGPU::V_SUBB_U32_e64;
7991 const auto *CarryRC = RI.getWaveMaskRegClass();
7992
7993 Register CarryInReg = Inst.getOperand(4).getReg();
7994 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7995 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7996 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7997 .addReg(CarryInReg);
7998 }
7999
8000 Register CarryOutReg = Inst.getOperand(1).getReg();
8001
8002 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
8003 MRI.getRegClass(Inst.getOperand(0).getReg())));
8004 MachineInstr *CarryOp =
8005 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
8006 .addReg(CarryOutReg, RegState::Define)
8007 .add(Inst.getOperand(2))
8008 .add(Inst.getOperand(3))
8009 .addReg(CarryInReg)
8010 .addImm(0);
8011 legalizeOperands(*CarryOp);
8012 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
8013 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8014 Inst.eraseFromParent();
8015 }
8016 return;
8017 case AMDGPU::S_UADDO_PSEUDO:
8018 case AMDGPU::S_USUBO_PSEUDO: {
8019 MachineOperand &Dest0 = Inst.getOperand(0);
8020 MachineOperand &Dest1 = Inst.getOperand(1);
8021 MachineOperand &Src0 = Inst.getOperand(2);
8022 MachineOperand &Src1 = Inst.getOperand(3);
8023
8024 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8025 ? AMDGPU::V_ADD_CO_U32_e64
8026 : AMDGPU::V_SUB_CO_U32_e64;
8027 const TargetRegisterClass *NewRC =
8028 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
8029 Register DestReg = MRI.createVirtualRegister(NewRC);
8030 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
8031 .addReg(Dest1.getReg(), RegState::Define)
8032 .add(Src0)
8033 .add(Src1)
8034 .addImm(0); // clamp bit
8035
8036 legalizeOperands(*NewInstr, MDT);
8037 MRI.replaceRegWith(Dest0.getReg(), DestReg);
8038 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8039 Inst.eraseFromParent();
8040 }
8041 return;
8042 case AMDGPU::S_LSHL1_ADD_U32:
8043 case AMDGPU::S_LSHL2_ADD_U32:
8044 case AMDGPU::S_LSHL3_ADD_U32:
8045 case AMDGPU::S_LSHL4_ADD_U32: {
8046 MachineOperand &Dest = Inst.getOperand(0);
8047 MachineOperand &Src0 = Inst.getOperand(1);
8048 MachineOperand &Src1 = Inst.getOperand(2);
8049 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8050 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8051 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8052 : 4);
8053
8054 const TargetRegisterClass *NewRC =
8055 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
8056 Register DestReg = MRI.createVirtualRegister(NewRC);
8057 MachineInstr *NewInstr =
8058 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8059 .add(Src0)
8060 .addImm(ShiftAmt)
8061 .add(Src1);
8062
8063 legalizeOperands(*NewInstr, MDT);
8064 MRI.replaceRegWith(Dest.getReg(), DestReg);
8065 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8066 Inst.eraseFromParent();
8067 }
8068 return;
8069 case AMDGPU::S_CSELECT_B32:
8070 case AMDGPU::S_CSELECT_B64:
8071 lowerSelect(Worklist, Inst, MDT);
8072 Inst.eraseFromParent();
8073 return;
8074 case AMDGPU::S_CMP_EQ_I32:
8075 case AMDGPU::S_CMP_LG_I32:
8076 case AMDGPU::S_CMP_GT_I32:
8077 case AMDGPU::S_CMP_GE_I32:
8078 case AMDGPU::S_CMP_LT_I32:
8079 case AMDGPU::S_CMP_LE_I32:
8080 case AMDGPU::S_CMP_EQ_U32:
8081 case AMDGPU::S_CMP_LG_U32:
8082 case AMDGPU::S_CMP_GT_U32:
8083 case AMDGPU::S_CMP_GE_U32:
8084 case AMDGPU::S_CMP_LT_U32:
8085 case AMDGPU::S_CMP_LE_U32:
8086 case AMDGPU::S_CMP_EQ_U64:
8087 case AMDGPU::S_CMP_LG_U64:
8088 case AMDGPU::S_CMP_LT_F32:
8089 case AMDGPU::S_CMP_EQ_F32:
8090 case AMDGPU::S_CMP_LE_F32:
8091 case AMDGPU::S_CMP_GT_F32:
8092 case AMDGPU::S_CMP_LG_F32:
8093 case AMDGPU::S_CMP_GE_F32:
8094 case AMDGPU::S_CMP_O_F32:
8095 case AMDGPU::S_CMP_U_F32:
8096 case AMDGPU::S_CMP_NGE_F32:
8097 case AMDGPU::S_CMP_NLG_F32:
8098 case AMDGPU::S_CMP_NGT_F32:
8099 case AMDGPU::S_CMP_NLE_F32:
8100 case AMDGPU::S_CMP_NEQ_F32:
8101 case AMDGPU::S_CMP_NLT_F32: {
8102 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8103 auto NewInstr =
8104 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8105 .setMIFlags(Inst.getFlags());
8106 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
8107 0) {
8108 NewInstr
8109 .addImm(0) // src0_modifiers
8110 .add(Inst.getOperand(0)) // src0
8111 .addImm(0) // src1_modifiers
8112 .add(Inst.getOperand(1)) // src1
8113 .addImm(0); // clamp
8114 } else {
8115 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
8116 }
8117 legalizeOperands(*NewInstr, MDT);
8118 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8119 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8120 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8121 Inst.eraseFromParent();
8122 return;
8123 }
8124 case AMDGPU::S_CMP_LT_F16:
8125 case AMDGPU::S_CMP_EQ_F16:
8126 case AMDGPU::S_CMP_LE_F16:
8127 case AMDGPU::S_CMP_GT_F16:
8128 case AMDGPU::S_CMP_LG_F16:
8129 case AMDGPU::S_CMP_GE_F16:
8130 case AMDGPU::S_CMP_O_F16:
8131 case AMDGPU::S_CMP_U_F16:
8132 case AMDGPU::S_CMP_NGE_F16:
8133 case AMDGPU::S_CMP_NLG_F16:
8134 case AMDGPU::S_CMP_NGT_F16:
8135 case AMDGPU::S_CMP_NLE_F16:
8136 case AMDGPU::S_CMP_NEQ_F16:
8137 case AMDGPU::S_CMP_NLT_F16: {
8138 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8139 auto NewInstr =
8140 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8141 .setMIFlags(Inst.getFlags());
8142 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8143 NewInstr
8144 .addImm(0) // src0_modifiers
8145 .add(Inst.getOperand(0)) // src0
8146 .addImm(0) // src1_modifiers
8147 .add(Inst.getOperand(1)) // src1
8148 .addImm(0); // clamp
8149 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8150 NewInstr.addImm(0); // op_sel0
8151 } else {
8152 NewInstr
8153 .add(Inst.getOperand(0))
8154 .add(Inst.getOperand(1));
8155 }
8156 legalizeOperandsVALUt16(*NewInstr, MRI);
8157 legalizeOperands(*NewInstr, MDT);
8158 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8159 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8160 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8161 Inst.eraseFromParent();
8162 return;
8163 }
8164 case AMDGPU::S_CVT_HI_F32_F16: {
8165 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8166 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8167 if (ST.useRealTrue16Insts()) {
8168 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8169 .add(Inst.getOperand(1));
8170 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8171 .addImm(0) // src0_modifiers
8172 .addReg(TmpReg, {}, AMDGPU::hi16)
8173 .addImm(0) // clamp
8174 .addImm(0) // omod
8175 .addImm(0); // op_sel0
8176 } else {
8177 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8178 .addImm(16)
8179 .add(Inst.getOperand(1));
8180 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8181 .addImm(0) // src0_modifiers
8182 .addReg(TmpReg)
8183 .addImm(0) // clamp
8184 .addImm(0); // omod
8185 }
8186
8187 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8188 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8189 Inst.eraseFromParent();
8190 return;
8191 }
8192 case AMDGPU::S_MINIMUM_F32:
8193 case AMDGPU::S_MAXIMUM_F32: {
8194 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8195 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8196 .addImm(0) // src0_modifiers
8197 .add(Inst.getOperand(1))
8198 .addImm(0) // src1_modifiers
8199 .add(Inst.getOperand(2))
8200 .addImm(0) // clamp
8201 .addImm(0); // omod
8202 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8203
8204 legalizeOperands(*NewInstr, MDT);
8205 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8206 Inst.eraseFromParent();
8207 return;
8208 }
8209 case AMDGPU::S_MINIMUM_F16:
8210 case AMDGPU::S_MAXIMUM_F16: {
8211 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8212 ? &AMDGPU::VGPR_16RegClass
8213 : &AMDGPU::VGPR_32RegClass);
8214 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8215 .addImm(0) // src0_modifiers
8216 .add(Inst.getOperand(1))
8217 .addImm(0) // src1_modifiers
8218 .add(Inst.getOperand(2))
8219 .addImm(0) // clamp
8220 .addImm(0) // omod
8221 .addImm(0); // opsel0
8222 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8223 legalizeOperandsVALUt16(*NewInstr, MRI);
8224 legalizeOperands(*NewInstr, MDT);
8225 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8226 Inst.eraseFromParent();
8227 return;
8228 }
8229 case AMDGPU::V_S_EXP_F16_e64:
8230 case AMDGPU::V_S_LOG_F16_e64:
8231 case AMDGPU::V_S_RCP_F16_e64:
8232 case AMDGPU::V_S_RSQ_F16_e64:
8233 case AMDGPU::V_S_SQRT_F16_e64: {
8234 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8235 ? &AMDGPU::VGPR_16RegClass
8236 : &AMDGPU::VGPR_32RegClass);
8237 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8238 .add(Inst.getOperand(1)) // src0_modifiers
8239 .add(Inst.getOperand(2))
8240 .add(Inst.getOperand(3)) // clamp
8241 .add(Inst.getOperand(4)) // omod
8242 .setMIFlags(Inst.getFlags());
8243 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8244 NewInstr.addImm(0); // opsel0
8245 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8246 legalizeOperandsVALUt16(*NewInstr, MRI);
8247 legalizeOperands(*NewInstr, MDT);
8248 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8249 Inst.eraseFromParent();
8250 return;
8251 }
8252 }
8253
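  // Generic path from here on: if there is no VALU equivalent we only
  // legalize the operands; if the opcode maps to itself we handle COPY-like
  // cases specially; otherwise a fresh VALU instruction is built, VOP3
  // modifier/clamp/omod operands are interspersed where the new encoding
  // expects them, the destination is retargeted to an equivalent VGPR class,
  // and users of the old result are queued for VALU conversion as well.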
8254 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8255 // We cannot move this instruction to the VALU, so we should try to
8256 // legalize its operands instead.
8257 legalizeOperands(Inst, MDT);
8258 return;
8259 }
8260 // Handle converting generic instructions like COPY-to-SGPR into
8261 // COPY-to-VGPR.
8262 if (NewOpcode == Opcode) {
8263 Register DstReg = Inst.getOperand(0).getReg();
8264 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8265
8266 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8267 // hope for the best.
8268 if (Inst.isCopy() && DstReg.isPhysical() &&
8269 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8270 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8271 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8272 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8273 .add(Inst.getOperand(1));
8274 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8275 DstReg)
8276 .addReg(NewDst);
8277
8278 Inst.eraseFromParent();
8279 return;
8280 }
8281
8282 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
8283 Register NewDstReg = Inst.getOperand(1).getReg();
8284 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8285 if (const TargetRegisterClass *CommonRC =
8286 RI.getCommonSubClass(NewDstRC, SrcRC)) {
8287 // Instead of creating a copy where src and dst are the same register
8288 // class, we just replace all uses of dst with src. These kinds of
8289 // copies interfere with the heuristics MachineSink uses to decide
8290 // whether or not to split a critical edge, since the pass assumes
8291 // that copies will end up as machine instructions and not be
8292 // eliminated.
8293 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8294 MRI.replaceRegWith(DstReg, NewDstReg);
8295 MRI.clearKillFlags(NewDstReg);
8296 Inst.getOperand(0).setReg(DstReg);
8297
8298 if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8299 llvm_unreachable("failed to constrain register");
8300
8301 Inst.eraseFromParent();
8302 // Legalize t16 operand since replaceReg is called after addUsersToVALU
8303 for (MachineOperand &MO :
8304 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8305 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8306 }
8307
8308 return;
8309 }
8310 }
8311
8312 // If this is a v2s copy between a 16-bit and a 32-bit reg,
8313 // replace the vgpr copy with a reg_sequence/extract_subreg.
8314 // This can be removed once we have sgpr16 in place.
8315 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8316 Inst.getOperand(1).getReg().isVirtual() &&
8317 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8318 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8319 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8320 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8321 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8322 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8323 get(AMDGPU::IMPLICIT_DEF), Undef);
8324 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8325 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8326 .addReg(Inst.getOperand(1).getReg())
8327 .addImm(AMDGPU::lo16)
8328 .addReg(Undef)
8329 .addImm(AMDGPU::hi16);
8330 Inst.eraseFromParent();
8331 MRI.replaceRegWith(DstReg, NewDstReg);
8332 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8333 return;
8334 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8335 AMDGPU::lo16)) {
8336 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8337 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8338 MRI.replaceRegWith(DstReg, NewDstReg);
8339 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8340 return;
8341 }
8342 }
8343
8344 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8345 MRI.replaceRegWith(DstReg, NewDstReg);
8346 legalizeOperands(Inst, MDT);
8347 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8348 return;
8349 }
8350
8351 // Use the new VALU Opcode.
8352 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8353 .setMIFlags(Inst.getFlags());
8354 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8355 // Intersperse VOP3 modifiers among the SALU operands.
8356 NewInstr->addOperand(Inst.getOperand(0));
8357 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8358 AMDGPU::OpName::src0_modifiers) >= 0)
8359 NewInstr.addImm(0);
8360 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8361 const MachineOperand &Src = Inst.getOperand(1);
8362 NewInstr->addOperand(Src);
8363 }
8364
8365 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8366 // We are converting these to a BFE, so we need to add the missing
8367 // operands for the size and offset.
8368 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8369 NewInstr.addImm(0);
8370 NewInstr.addImm(Size);
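// e.g. S_SEXT_I32_I8 %dst, %src becomes, roughly, V_BFE_I32_e64 %dst, %src,
// /*offset*/ 0, /*width*/ 8.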
8371 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8372 // The VALU version adds the second operand to the result, so insert an
8373 // extra 0 operand.
8374 NewInstr.addImm(0);
8375 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8376 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8377 // If we need to move this to VGPRs, we need to unpack the second
8378 // operand back into the 2 separate ones for bit offset and width.
8379 assert(OffsetWidthOp.isImm() &&
8380 "Scalar BFE is only implemented for constant width and offset");
8381 uint32_t Imm = OffsetWidthOp.getImm();
8382
8383 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8384 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8385 NewInstr.addImm(Offset);
8386 NewInstr.addImm(BitWidth);
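// For example, an S_BFE immediate of 0x100008 unpacks to offset 8 (bits
// [5:0]) and width 16 (bits [22:16]).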
8387 } else {
8388 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8389 AMDGPU::OpName::src1_modifiers) >= 0)
8390 NewInstr.addImm(0);
8391 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8392 NewInstr->addOperand(Inst.getOperand(2));
8393 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8394 AMDGPU::OpName::src2_modifiers) >= 0)
8395 NewInstr.addImm(0);
8396 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8397 NewInstr->addOperand(Inst.getOperand(3));
8398 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8399 NewInstr.addImm(0);
8400 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8401 NewInstr.addImm(0);
8402 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8403 NewInstr.addImm(0);
8404 }
8405 } else {
8406 // Just copy the SALU operands.
8407 for (const MachineOperand &Op : Inst.explicit_operands())
8408 NewInstr->addOperand(Op);
8409 }
8410
8411 // Remove any references to SCC. Vector instructions can't read from it, and
8412 // we're just about to add the implicit use / defs of VCC, and we don't want
8413 // both.
8414 for (MachineOperand &Op : Inst.implicit_operands()) {
8415 if (Op.getReg() == AMDGPU::SCC) {
8416 // Only propagate through live-def of SCC.
8417 if (Op.isDef() && !Op.isDead())
8418 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8419 if (Op.isUse())
8420 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8421 }
8422 }
8423 Inst.eraseFromParent();
8424 Register NewDstReg;
8425 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8426 Register DstReg = NewInstr->getOperand(0).getReg();
8427 assert(DstReg.isVirtual());
8428 // Update the destination register class.
8429 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8430 assert(NewDstRC);
8431 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8432 MRI.replaceRegWith(DstReg, NewDstReg);
8433 }
8434 fixImplicitOperands(*NewInstr);
8435
8436 legalizeOperandsVALUt16(*NewInstr, MRI);
8437
8438 // Legalize the operands
8439 legalizeOperands(*NewInstr, MDT);
8440 if (NewDstReg)
8441 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8442}
8443
8444// Add/sub require special handling to deal with carry outs.
8445std::pair<bool, MachineBasicBlock *>
8446SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8447 MachineDominatorTree *MDT) const {
8448 if (ST.hasAddNoCarryInsts()) {
8449 // Assume there is no user of scc since we don't select this in that case.
8450 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8451 // is used.
8452
8453 MachineBasicBlock &MBB = *Inst.getParent();
8454 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8455
8456 Register OldDstReg = Inst.getOperand(0).getReg();
8457 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8458
8459 unsigned Opc = Inst.getOpcode();
8460 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8461
8462 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8463 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8464
8465 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8466 Inst.removeOperand(3);
8467
8468 Inst.setDesc(get(NewOpc));
8469 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8470 Inst.addImplicitDefUseOperands(*MBB.getParent());
8471 MRI.replaceRegWith(OldDstReg, ResultReg);
8472 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8473
8474 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8475 return std::pair(true, NewBB);
8476 }
8477
8478 return std::pair(false, nullptr);
8479}
8480
8481void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8482 MachineDominatorTree *MDT) const {
8483
8484 MachineBasicBlock &MBB = *Inst.getParent();
8485 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8486 MachineBasicBlock::iterator MII = Inst;
8487 const DebugLoc &DL = Inst.getDebugLoc();
8488
8489 MachineOperand &Dest = Inst.getOperand(0);
8490 MachineOperand &Src0 = Inst.getOperand(1);
8491 MachineOperand &Src1 = Inst.getOperand(2);
8492 MachineOperand &Cond = Inst.getOperand(3);
8493
8494 Register CondReg = Cond.getReg();
8495 bool IsSCC = (CondReg == AMDGPU::SCC);
8496
8497 // If this is a trivial select where the condition is effectively not SCC
8498 // (CondReg is a source of copy to SCC), then the select is semantically
8499 // equivalent to copying CondReg. Hence, there is no need to create
8500 // V_CNDMASK, we can just use that and bail out.
8501 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8502 (Src1.getImm() == 0)) {
8503 MRI.replaceRegWith(Dest.getReg(), CondReg);
8504 return;
8505 }
8506
8507 Register NewCondReg = CondReg;
8508 if (IsSCC) {
8509 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8510 NewCondReg = MRI.createVirtualRegister(TC);
8511
8512 // Now look for the closest SCC def; if it is a copy,
8513 // replace the CondReg with the COPY source register.
8514 bool CopyFound = false;
8515 for (MachineInstr &CandI :
8516 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8517 Inst.getParent()->rend())) {
8518 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8519 -1) {
8520 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8521 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8522 .addReg(CandI.getOperand(1).getReg());
8523 CopyFound = true;
8524 }
8525 break;
8526 }
8527 }
8528 if (!CopyFound) {
8529 // SCC def is not a copy
8530 // Insert a trivial select instead of creating a copy, because a copy from
8531 // SCC would semantically mean just copying a single bit, but we may need
8532 // the result to be a vector condition mask that needs preserving.
8533 unsigned Opcode =
8534 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8535 auto NewSelect =
8536 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8537 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8538 }
8539 }
8540
8541 Register NewDestReg = MRI.createVirtualRegister(
8542 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8543 MachineInstr *NewInst;
8544 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8545 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8546 .addImm(0)
8547 .add(Src1) // False
8548 .addImm(0)
8549 .add(Src0) // True
8550 .addReg(NewCondReg);
8551 } else {
8552 NewInst =
8553 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8554 .add(Src1) // False
8555 .add(Src0) // True
8556 .addReg(NewCondReg);
8557 }
8558 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8559 legalizeOperands(*NewInst, MDT);
8560 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8561}
8562
8563void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8564 MachineInstr &Inst) const {
8565 MachineBasicBlock &MBB = *Inst.getParent();
8566 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8567 MachineBasicBlock::iterator MII = Inst;
8568 const DebugLoc &DL = Inst.getDebugLoc();
8569
8570 MachineOperand &Dest = Inst.getOperand(0);
8571 MachineOperand &Src = Inst.getOperand(1);
8572 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8573 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8574
8575 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8576 : AMDGPU::V_SUB_CO_U32_e32;
8577
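 // |x| is lowered as max(x, 0 - x): negate Src into TmpReg, then take the
 // signed maximum.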
8578 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8579 .addImm(0)
8580 .addReg(Src.getReg());
8581
8582 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8583 .addReg(Src.getReg())
8584 .addReg(TmpReg);
8585
8586 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8587 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8588}
8589
8590void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8591 MachineInstr &Inst) const {
8592 MachineBasicBlock &MBB = *Inst.getParent();
8593 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8594 MachineBasicBlock::iterator MII = Inst;
8595 const DebugLoc &DL = Inst.getDebugLoc();
8596
8597 MachineOperand &Dest = Inst.getOperand(0);
8598 MachineOperand &Src1 = Inst.getOperand(1);
8599 MachineOperand &Src2 = Inst.getOperand(2);
8600 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8601 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8602 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8603
8604 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8605 : AMDGPU::V_SUB_CO_U32_e32;
8606
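 // |a - b| is lowered as max(a - b, 0 - (a - b)): subtract, negate the
 // difference, then take the signed maximum.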
8607 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8608 .addReg(Src1.getReg())
8609 .addReg(Src2.getReg());
8610
8611 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8612
8613 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8614 .addReg(SubResultReg)
8615 .addReg(TmpReg);
8616
8617 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8618 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8619}
8620
8621void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8622 MachineInstr &Inst) const {
8623 MachineBasicBlock &MBB = *Inst.getParent();
8624 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8625 MachineBasicBlock::iterator MII = Inst;
8626 const DebugLoc &DL = Inst.getDebugLoc();
8627
8628 MachineOperand &Dest = Inst.getOperand(0);
8629 MachineOperand &Src0 = Inst.getOperand(1);
8630 MachineOperand &Src1 = Inst.getOperand(2);
8631
8632 if (ST.hasDLInsts()) {
8633 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8634 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8635 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8636
8637 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8638 .add(Src0)
8639 .add(Src1);
8640
8641 MRI.replaceRegWith(Dest.getReg(), NewDest);
8642 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8643 } else {
8644 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8645 // invert either source and then perform the XOR. If either source is a
8646 // scalar register, then we can leave the inversion on the scalar unit to
8647 // achieve a better distribution of scalar and vector instructions.
8648 bool Src0IsSGPR = Src0.isReg() &&
8649 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8650 bool Src1IsSGPR = Src1.isReg() &&
8651 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8652 MachineInstr *Xor;
8653 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8654 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8655
8656 // Build a pair of scalar instructions and add them to the work list.
8657 // The next iteration over the work list will lower these to the vector
8658 // unit as necessary.
8659 if (Src0IsSGPR) {
8660 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8661 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8662 .addReg(Temp)
8663 .add(Src1);
8664 } else if (Src1IsSGPR) {
8665 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8666 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8667 .add(Src0)
8668 .addReg(Temp);
8669 } else {
8670 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8671 .add(Src0)
8672 .add(Src1);
8673 MachineInstr *Not =
8674 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8675 Worklist.insert(Not);
8676 }
8677
8678 MRI.replaceRegWith(Dest.getReg(), NewDest);
8679
8680 Worklist.insert(Xor);
8681
8682 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8683 }
8684}
8685
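// Lower an SALU not-of-binop (e.g. s_nand/s_nor) by emitting the base binop
// followed by s_not; both new scalar instructions are queued so a later
// worklist pass can move them to the VALU if needed.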
8686void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8687 MachineInstr &Inst,
8688 unsigned Opcode) const {
8689 MachineBasicBlock &MBB = *Inst.getParent();
8690 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8691 MachineBasicBlock::iterator MII = Inst;
8692 const DebugLoc &DL = Inst.getDebugLoc();
8693
8694 MachineOperand &Dest = Inst.getOperand(0);
8695 MachineOperand &Src0 = Inst.getOperand(1);
8696 MachineOperand &Src1 = Inst.getOperand(2);
8697
8698 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8699 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8700
8701 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8702 .add(Src0)
8703 .add(Src1);
8704
8705 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8706 .addReg(Interm);
8707
8708 Worklist.insert(&Op);
8709 Worklist.insert(&Not);
8710
8711 MRI.replaceRegWith(Dest.getReg(), NewDest);
8712 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8713}
8714
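// Lower an SALU op-not pattern (e.g. s_andn2/s_orn2) by inverting Src1 first
// and then applying the base binop; both new scalar instructions are queued
// for further lowering.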
8715void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8716 MachineInstr &Inst,
8717 unsigned Opcode) const {
8718 MachineBasicBlock &MBB = *Inst.getParent();
8719 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8720 MachineBasicBlock::iterator MII = Inst;
8721 const DebugLoc &DL = Inst.getDebugLoc();
8722
8723 MachineOperand &Dest = Inst.getOperand(0);
8724 MachineOperand &Src0 = Inst.getOperand(1);
8725 MachineOperand &Src1 = Inst.getOperand(2);
8726
8727 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8728 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8729
8730 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8731 .add(Src1);
8732
8733 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8734 .add(Src0)
8735 .addReg(Interm);
8736
8737 Worklist.insert(&Not);
8738 Worklist.insert(&Op);
8739
8740 MRI.replaceRegWith(Dest.getReg(), NewDest);
8741 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8742}
8743
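// Split a 64-bit scalar unary operation into two 32-bit operations on the
// sub0/sub1 halves and recombine the results with a REG_SEQUENCE.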
8744void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8745 MachineInstr &Inst, unsigned Opcode,
8746 bool Swap) const {
8747 MachineBasicBlock &MBB = *Inst.getParent();
8748 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8749
8750 MachineOperand &Dest = Inst.getOperand(0);
8751 MachineOperand &Src0 = Inst.getOperand(1);
8752 const DebugLoc &DL = Inst.getDebugLoc();
8753
8754 MachineBasicBlock::iterator MII = Inst;
8755
8756 const MCInstrDesc &InstDesc = get(Opcode);
8757 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8758 MRI.getRegClass(Src0.getReg()) :
8759 &AMDGPU::SGPR_32RegClass;
8760
8761 const TargetRegisterClass *Src0SubRC =
8762 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8763
8764 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8765 AMDGPU::sub0, Src0SubRC);
8766
8767 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8768 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8769 const TargetRegisterClass *NewDestSubRC =
8770 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8771
8772 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8773 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8774
8775 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8776 AMDGPU::sub1, Src0SubRC);
8777
8778 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8779 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8780
8781 if (Swap)
8782 std::swap(DestSub0, DestSub1);
8783
8784 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8785 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8786 .addReg(DestSub0)
8787 .addImm(AMDGPU::sub0)
8788 .addReg(DestSub1)
8789 .addImm(AMDGPU::sub1);
8790
8791 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8792
8793 Worklist.insert(&LoHalf);
8794 Worklist.insert(&HiHalf);
8795
8796 // We don't need to legalizeOperands here because for a single operand, src0
8797 // will support any kind of input.
8798
8799 // Move all users of this moved value.
8800 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8801}
8802
8803// There is not a vector equivalent of s_mul_u64. For this reason, we need to
8804// split the s_mul_u64 in 32-bit vector multiplications.
8805void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8806 MachineInstr &Inst,
8807 MachineDominatorTree *MDT) const {
8808 MachineBasicBlock &MBB = *Inst.getParent();
8809 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8810
8811 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8812 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8813 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8814
8815 MachineOperand &Dest = Inst.getOperand(0);
8816 MachineOperand &Src0 = Inst.getOperand(1);
8817 MachineOperand &Src1 = Inst.getOperand(2);
8818 const DebugLoc &DL = Inst.getDebugLoc();
8819 MachineBasicBlock::iterator MII = Inst;
8820
8821 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8822 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8823 const TargetRegisterClass *Src0SubRC =
8824 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8825 if (RI.isSGPRClass(Src0SubRC))
8826 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8827 const TargetRegisterClass *Src1SubRC =
8828 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8829 if (RI.isSGPRClass(Src1SubRC))
8830 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8831
8832 // First, we extract the low 32-bit and high 32-bit values from each of the
8833 // operands.
8834 MachineOperand Op0L =
8835 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8836 MachineOperand Op1L =
8837 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8838 MachineOperand Op0H =
8839 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8840 MachineOperand Op1H =
8841 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8842
8843 // The multiplication is done as follows:
8844 //
8845 // Op1H Op1L
8846 // * Op0H Op0L
8847 // --------------------
8848 // Op1H*Op0L Op1L*Op0L
8849 // + Op1H*Op0H Op1L*Op0H
8850 // -----------------------------------------
8851 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8852 //
8853 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8854 // value and that would overflow.
8855 // The low 32-bit value is Op1L*Op0L.
8856 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
8857
8858 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8859 MachineInstr *Op1L_Op0H =
8860 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8861 .add(Op1L)
8862 .add(Op0H);
8863
8864 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8865 MachineInstr *Op1H_Op0L =
8866 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8867 .add(Op1H)
8868 .add(Op0L);
8869
8870 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8871 MachineInstr *Carry =
8872 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8873 .add(Op1L)
8874 .add(Op0L);
8875
8876 MachineInstr *LoHalf =
8877 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8878 .add(Op1L)
8879 .add(Op0L);
8880
8881 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8882 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8883 .addReg(Op1L_Op0H_Reg)
8884 .addReg(Op1H_Op0L_Reg);
8885
8886 MachineInstr *HiHalf =
8887 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8888 .addReg(AddReg)
8889 .addReg(CarryReg);
8890
8891 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8892 .addReg(DestSub0)
8893 .addImm(AMDGPU::sub0)
8894 .addReg(DestSub1)
8895 .addImm(AMDGPU::sub1);
8896
8897 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8898
8899 // Try to legalize the operands in case we need to swap the order to keep it
8900 // valid.
8901 legalizeOperands(*Op1L_Op0H, MDT);
8902 legalizeOperands(*Op1H_Op0L, MDT);
8903 legalizeOperands(*Carry, MDT);
8904 legalizeOperands(*LoHalf, MDT);
8905 legalizeOperands(*Add, MDT);
8906 legalizeOperands(*HiHalf, MDT);
8907
8908 // Move all users of this moved value.
8909 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8910}
8911
8912// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
8913// multiplications.
8914void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8915 MachineInstr &Inst,
8916 MachineDominatorTree *MDT) const {
8917 MachineBasicBlock &MBB = *Inst.getParent();
8918 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8919
8920 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8921 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8922 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8923
8924 MachineOperand &Dest = Inst.getOperand(0);
8925 MachineOperand &Src0 = Inst.getOperand(1);
8926 MachineOperand &Src1 = Inst.getOperand(2);
8927 const DebugLoc &DL = Inst.getDebugLoc();
8928 MachineBasicBlock::iterator MII = Inst;
8929
8930 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8931 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8932 const TargetRegisterClass *Src0SubRC =
8933 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8934 if (RI.isSGPRClass(Src0SubRC))
8935 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8936 const TargetRegisterClass *Src1SubRC =
8937 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8938 if (RI.isSGPRClass(Src1SubRC))
8939 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8940
8941 // First, we extract the low 32-bit and high 32-bit values from each of the
8942 // operands.
8943 MachineOperand Op0L =
8944 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8945 MachineOperand Op1L =
8946 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8947
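 // Only the low halves are multiplied: these pseudos take 32-bit values that
 // were zero- or sign-extended to 64 bits, so the 64-bit product is just the
 // {mul_hi, mul_lo} of the low halves (unsigned or signed high half
 // accordingly).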
8948 unsigned Opc = Inst.getOpcode();
8949 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8950 ? AMDGPU::V_MUL_HI_U32_e64
8951 : AMDGPU::V_MUL_HI_I32_e64;
8952 MachineInstr *HiHalf =
8953 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8954
8955 MachineInstr *LoHalf =
8956 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8957 .add(Op1L)
8958 .add(Op0L);
8959
8960 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8961 .addReg(DestSub0)
8962 .addImm(AMDGPU::sub0)
8963 .addReg(DestSub1)
8964 .addImm(AMDGPU::sub1);
8965
8966 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8967
8968 // Try to legalize the operands in case we need to swap the order to keep it
8969 // valid.
8970 legalizeOperands(*HiHalf, MDT);
8971 legalizeOperands(*LoHalf, MDT);
8972
8973 // Move all users of this moved value.
8974 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8975}
8976
8977void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8978 MachineInstr &Inst, unsigned Opcode,
8979 MachineDominatorTree *MDT) const {
8980 MachineBasicBlock &MBB = *Inst.getParent();
8981 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8982
8983 MachineOperand &Dest = Inst.getOperand(0);
8984 MachineOperand &Src0 = Inst.getOperand(1);
8985 MachineOperand &Src1 = Inst.getOperand(2);
8986 const DebugLoc &DL = Inst.getDebugLoc();
8987
8988 MachineBasicBlock::iterator MII = Inst;
8989
8990 const MCInstrDesc &InstDesc = get(Opcode);
8991 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8992 MRI.getRegClass(Src0.getReg()) :
8993 &AMDGPU::SGPR_32RegClass;
8994
8995 const TargetRegisterClass *Src0SubRC =
8996 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8997 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8998 MRI.getRegClass(Src1.getReg()) :
8999 &AMDGPU::SGPR_32RegClass;
9000
9001 const TargetRegisterClass *Src1SubRC =
9002 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9003
9004 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9005 AMDGPU::sub0, Src0SubRC);
9006 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9007 AMDGPU::sub0, Src1SubRC);
9008 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9009 AMDGPU::sub1, Src0SubRC);
9010 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9011 AMDGPU::sub1, Src1SubRC);
9012
9013 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9014 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
9015 const TargetRegisterClass *NewDestSubRC =
9016 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9017
9018 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
9019 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
9020 .add(SrcReg0Sub0)
9021 .add(SrcReg1Sub0);
9022
9023 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
9024 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
9025 .add(SrcReg0Sub1)
9026 .add(SrcReg1Sub1);
9027
9028 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
9029 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9030 .addReg(DestSub0)
9031 .addImm(AMDGPU::sub0)
9032 .addReg(DestSub1)
9033 .addImm(AMDGPU::sub1);
9034
9035 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9036
9037 Worklist.insert(&LoHalf);
9038 Worklist.insert(&HiHalf);
9039
9040 // Move all users of this moved value.
9041 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9042}
9043
9044void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9045 MachineInstr &Inst,
9046 MachineDominatorTree *MDT) const {
9047 MachineBasicBlock &MBB = *Inst.getParent();
9048 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9049
9050 MachineOperand &Dest = Inst.getOperand(0);
9051 MachineOperand &Src0 = Inst.getOperand(1);
9052 MachineOperand &Src1 = Inst.getOperand(2);
9053 const DebugLoc &DL = Inst.getDebugLoc();
9054
9055 MachineBasicBlock::iterator MII = Inst;
9056
9057 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9058
9059 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9060
9061 MachineOperand* Op0;
9062 MachineOperand* Op1;
9063
9064 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9065 Op0 = &Src0;
9066 Op1 = &Src1;
9067 } else {
9068 Op0 = &Src1;
9069 Op1 = &Src0;
9070 }
9071
9072 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9073 .add(*Op0);
9074
9075 Register NewDest = MRI.createVirtualRegister(DestRC);
9076
9077 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9078 .addReg(Interm)
9079 .add(*Op1);
9080
9081 MRI.replaceRegWith(Dest.getReg(), NewDest);
9082
9083 Worklist.insert(&Xor);
9084}
9085
9086void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9087 MachineInstr &Inst) const {
9088 MachineBasicBlock &MBB = *Inst.getParent();
9089 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9090
9091 MachineBasicBlock::iterator MII = Inst;
9092 const DebugLoc &DL = Inst.getDebugLoc();
9093
9094 MachineOperand &Dest = Inst.getOperand(0);
9095 MachineOperand &Src = Inst.getOperand(1);
9096
9097 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
9098 const TargetRegisterClass *SrcRC = Src.isReg() ?
9099 MRI.getRegClass(Src.getReg()) :
9100 &AMDGPU::SGPR_32RegClass;
9101
9102 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9103 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9104
9105 const TargetRegisterClass *SrcSubRC =
9106 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9107
9108 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9109 AMDGPU::sub0, SrcSubRC);
9110 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9111 AMDGPU::sub1, SrcSubRC);
9112
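 // V_BCNT_U32_B32 adds its popcount to the second source operand, so count
 // sub0 with an accumulator of 0 and then count sub1 on top of that partial
 // result.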
9113 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
9114
9115 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
9116
9117 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9118
9119 // We don't need to legalize operands here. src0 for either instruction can be
9120 // an SGPR, and the second input is unused or determined here.
9121 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9122}
9123
9124void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9125 MachineInstr &Inst) const {
9126 MachineBasicBlock &MBB = *Inst.getParent();
9127 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9128 MachineBasicBlock::iterator MII = Inst;
9129 const DebugLoc &DL = Inst.getDebugLoc();
9130
9131 MachineOperand &Dest = Inst.getOperand(0);
9132 uint32_t Imm = Inst.getOperand(2).getImm();
9133 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9134 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
9135
9136 (void) Offset;
9137
9138 // Only sext_inreg cases handled.
9139 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9140 Offset == 0 && "Not implemented");
9141
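 // For widths narrower than 32 bits, V_BFE_I32 sign-extends the low half; the
 // high half is then just the sign bit replicated, i.e. an arithmetic shift
 // right by 31.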
9142 if (BitWidth < 32) {
9143 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9144 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9145 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9146
9147 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
9148 .addReg(Inst.getOperand(1).getReg(), {}, AMDGPU::sub0)
9149 .addImm(0)
9150 .addImm(BitWidth);
9151
9152 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
9153 .addImm(31)
9154 .addReg(MidRegLo);
9155
9156 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9157 .addReg(MidRegLo)
9158 .addImm(AMDGPU::sub0)
9159 .addReg(MidRegHi)
9160 .addImm(AMDGPU::sub1);
9161
9162 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9163 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9164 return;
9165 }
9166
9167 MachineOperand &Src = Inst.getOperand(1);
9168 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9169 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9170
9171 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
9172 .addImm(31)
9173 .addReg(Src.getReg(), {}, AMDGPU::sub0);
9174
9175 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9176 .addReg(Src.getReg(), {}, AMDGPU::sub0)
9177 .addImm(AMDGPU::sub0)
9178 .addReg(TmpReg)
9179 .addImm(AMDGPU::sub1);
9180
9181 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9182 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9183}
9184
9185void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9186 MachineInstr &Inst, unsigned Opcode,
9187 MachineDominatorTree *MDT) const {
9188 // (S_FLBIT_I32_B64 hi:lo) ->
9189 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9190 // (S_FF1_I32_B64 hi:lo) ->
9191 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
9192
9193 MachineBasicBlock &MBB = *Inst.getParent();
9194 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9195 MachineBasicBlock::iterator MII = Inst;
9196 const DebugLoc &DL = Inst.getDebugLoc();
9197
9198 MachineOperand &Dest = Inst.getOperand(0);
9199 MachineOperand &Src = Inst.getOperand(1);
9200
9201 const MCInstrDesc &InstDesc = get(Opcode);
9202
9203 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9204 unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
9205 : AMDGPU::V_ADD_CO_U32_e32;
9206
9207 const TargetRegisterClass *SrcRC =
9208 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9209 const TargetRegisterClass *SrcSubRC =
9210 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9211
9212 MachineOperand SrcRegSub0 =
9213 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9214 MachineOperand SrcRegSub1 =
9215 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9216
9217 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9218 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9219 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9220 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9221
9222 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9223
9224 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9225
9226 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9227 .addReg(IsCtlz ? MidReg1 : MidReg2)
9228 .addImm(32)
9229 .addImm(1); // enable clamp
9230
9231 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9232 .addReg(MidReg3)
9233 .addReg(IsCtlz ? MidReg2 : MidReg1);
9234
9235 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9236
9237 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9238}
9239
9240void SIInstrInfo::addUsersToMoveToVALUWorklist(
9241 Register DstReg, MachineRegisterInfo &MRI,
9242 SIInstrWorklist &Worklist) const {
9243 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9244 MachineInstr &UseMI = *MO.getParent();
9245
9246 unsigned OpNo = 0;
9247
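 // For the copy-like opcodes below the operand class simply follows the
 // user's result, so keep checking operand 0; for any other user, check the
 // register class required at the actual use operand.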
9248 switch (UseMI.getOpcode()) {
9249 case AMDGPU::COPY:
9250 case AMDGPU::WQM:
9251 case AMDGPU::SOFT_WQM:
9252 case AMDGPU::STRICT_WWM:
9253 case AMDGPU::STRICT_WQM:
9254 case AMDGPU::REG_SEQUENCE:
9255 case AMDGPU::PHI:
9256 case AMDGPU::INSERT_SUBREG:
9257 break;
9258 default:
9259 OpNo = MO.getOperandNo();
9260 break;
9261 }
9262
9263 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9264 MRI.constrainRegClass(DstReg, OpRC);
9265
9266 if (!RI.hasVectorRegisters(OpRC))
9267 Worklist.insert(&UseMI);
9268 else
9269 // Legalization could change user list.
9270 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9271 }
9272}
9273
9274void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9275 MachineRegisterInfo &MRI,
9276 MachineInstr &Inst) const {
9277 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9278 MachineBasicBlock *MBB = Inst.getParent();
9279 MachineOperand &Src0 = Inst.getOperand(1);
9280 MachineOperand &Src1 = Inst.getOperand(2);
9281 const DebugLoc &DL = Inst.getDebugLoc();
9282
9283 if (ST.useRealTrue16Insts()) {
9284 Register SrcReg0, SrcReg1;
9285 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9286 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9287 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
9288 } else {
9289 SrcReg0 = Src0.getReg();
9290 }
9291
9292 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9293 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9294 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
9295 } else {
9296 SrcReg1 = Src1.getReg();
9297 }
9298
9299 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9300 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9301
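 // With real true16 instructions the packed 32-bit value can be formed
 // directly as a REG_SEQUENCE of 16-bit subregisters, avoiding the
 // shift-and-mask expansion used below for non-true16 targets.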
9302 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9303 switch (Inst.getOpcode()) {
9304 case AMDGPU::S_PACK_LL_B32_B16:
9305 NewMI
9306 .addReg(SrcReg0, {},
9307 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9308 .addImm(AMDGPU::lo16)
9309 .addReg(SrcReg1, {},
9310 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9311 .addImm(AMDGPU::hi16);
9312 break;
9313 case AMDGPU::S_PACK_LH_B32_B16:
9314 NewMI
9315 .addReg(SrcReg0, {},
9316 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9317 .addImm(AMDGPU::lo16)
9318 .addReg(SrcReg1, {}, AMDGPU::hi16)
9319 .addImm(AMDGPU::hi16);
9320 break;
9321 case AMDGPU::S_PACK_HL_B32_B16:
9322 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9323 .addImm(AMDGPU::lo16)
9324 .addReg(SrcReg1, {},
9325 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9326 .addImm(AMDGPU::hi16);
9327 break;
9328 case AMDGPU::S_PACK_HH_B32_B16:
9329 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9330 .addImm(AMDGPU::lo16)
9331 .addReg(SrcReg1, {}, AMDGPU::hi16)
9332 .addImm(AMDGPU::hi16);
9333 break;
9334 default:
9335 llvm_unreachable("unhandled s_pack_* instruction");
9336 }
9337
9338 MachineOperand &Dest = Inst.getOperand(0);
9339 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9340 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9341 return;
9342 }
9343
9344 switch (Inst.getOpcode()) {
9345 case AMDGPU::S_PACK_LL_B32_B16: {
9346 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9347 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9348
9349 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9350 // 0.
9351 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9352 .addImm(0xffff);
9353
9354 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9355 .addReg(ImmReg, RegState::Kill)
9356 .add(Src0);
9357
9358 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9359 .add(Src1)
9360 .addImm(16)
9361 .addReg(TmpReg, RegState::Kill);
9362 break;
9363 }
9364 case AMDGPU::S_PACK_LH_B32_B16: {
9365 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9366 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9367 .addImm(0xffff);
9368 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9369 .addReg(ImmReg, RegState::Kill)
9370 .add(Src0)
9371 .add(Src1);
9372 break;
9373 }
9374 case AMDGPU::S_PACK_HL_B32_B16: {
9375 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9376 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9377 .addImm(16)
9378 .add(Src0);
9379 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9380 .add(Src1)
9381 .addImm(16)
9382 .addReg(TmpReg, RegState::Kill);
9383 break;
9384 }
9385 case AMDGPU::S_PACK_HH_B32_B16: {
9386 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9387 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9388 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9389 .addImm(16)
9390 .add(Src0);
9391 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9392 .addImm(0xffff0000);
9393 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9394 .add(Src1)
9395 .addReg(ImmReg, RegState::Kill)
9396 .addReg(TmpReg, RegState::Kill);
9397 break;
9398 }
9399 default:
9400 llvm_unreachable("unhandled s_pack_* instruction");
9401 }
9402
9403 MachineOperand &Dest = Inst.getOperand(0);
9404 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9405 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9406}
9407
9408void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9409 MachineInstr &SCCDefInst,
9410 SIInstrWorklist &Worklist,
9411 Register NewCond) const {
9412
9413 // Ensure that def inst defines SCC, which is still live.
9414 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9415 !Op.isDead() && Op.getParent() == &SCCDefInst);
9416 SmallVector<MachineInstr *, 4> CopyToDelete;
9417 // This assumes that all the users of SCC are in the same block
9418 // as the SCC def.
9419 for (MachineInstr &MI : // Skip the def inst itself.
9420 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9421 SCCDefInst.getParent()->end())) {
9422 // Check if SCC is used first.
9423 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9424 if (SCCIdx != -1) {
9425 if (MI.isCopy()) {
9426 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9427 Register DestReg = MI.getOperand(0).getReg();
9428
9429 MRI.replaceRegWith(DestReg, NewCond);
9430 CopyToDelete.push_back(&MI);
9431 } else {
9432
9433 if (NewCond.isValid())
9434 MI.getOperand(SCCIdx).setReg(NewCond);
9435
9436 Worklist.insert(&MI);
9437 }
9438 }
9439 // Exit if we find another SCC def.
9440 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9441 break;
9442 }
9443 for (auto &Copy : CopyToDelete)
9444 Copy->eraseFromParent();
9445}
9446
9447// Instructions that use SCC may be converted to VALU instructions. When that
9448// happens, the SCC register is changed to VCC_LO. The instruction that defines
9449// SCC must be changed to an instruction that defines VCC. This function makes
9450// sure that the instruction that defines SCC is added to the moveToVALU
9451// worklist.
9452void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9453 SIInstrWorklist &Worklist) const {
9454 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9455 // then there is nothing to do because the defining instruction has been
9456 // converted to a VALU already. If SCC then that instruction needs to be
9457 // converted to a VALU.
9458 for (MachineInstr &MI :
9459 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9460 SCCUseInst->getParent()->rend())) {
9461 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9462 break;
9463 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9464 Worklist.insert(&MI);
9465 break;
9466 }
9467 }
9468}
9469
9470const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9471 const MachineInstr &Inst) const {
9472 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9473
9474 switch (Inst.getOpcode()) {
9475 // For target instructions, getOpRegClass just returns the virtual register
9476 // class associated with the operand, so we need to find an equivalent VGPR
9477 // register class in order to move the instruction to the VALU.
9478 case AMDGPU::COPY:
9479 case AMDGPU::PHI:
9480 case AMDGPU::REG_SEQUENCE:
9481 case AMDGPU::INSERT_SUBREG:
9482 case AMDGPU::WQM:
9483 case AMDGPU::SOFT_WQM:
9484 case AMDGPU::STRICT_WWM:
9485 case AMDGPU::STRICT_WQM: {
9486 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9487 if (RI.isAGPRClass(SrcRC)) {
9488 if (RI.isAGPRClass(NewDstRC))
9489 return nullptr;
9490
9491 switch (Inst.getOpcode()) {
9492 case AMDGPU::PHI:
9493 case AMDGPU::REG_SEQUENCE:
9494 case AMDGPU::INSERT_SUBREG:
9495 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9496 break;
9497 default:
9498 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9499 }
9500
9501 if (!NewDstRC)
9502 return nullptr;
9503 } else {
9504 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9505 return nullptr;
9506
9507 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9508 if (!NewDstRC)
9509 return nullptr;
9510 }
9511
9512 return NewDstRC;
9513 }
9514 default:
9515 return NewDstRC;
9516 }
9517}
9518
9519// Find the one SGPR operand we are allowed to use.
9520Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9521 int OpIndices[3]) const {
9522 const MCInstrDesc &Desc = MI.getDesc();
9523
9524 // Find the one SGPR operand we are allowed to use.
9525 //
9526 // First we need to consider the instruction's operand requirements before
9527 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9528 // of VCC, but we are still bound by the constant bus requirement to only use
9529 // one.
9530 //
9531 // If the operand's class is an SGPR, we can never move it.
9532
9533 Register SGPRReg = findImplicitSGPRRead(MI);
9534 if (SGPRReg)
9535 return SGPRReg;
9536
9537 Register UsedSGPRs[3] = {Register()};
9538 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9539
9540 for (unsigned i = 0; i < 3; ++i) {
9541 int Idx = OpIndices[i];
9542 if (Idx == -1)
9543 break;
9544
9545 const MachineOperand &MO = MI.getOperand(Idx);
9546 if (!MO.isReg())
9547 continue;
9548
9549 // Is this operand statically required to be an SGPR based on the operand
9550 // constraints?
9551 const TargetRegisterClass *OpRC =
9552 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9553 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9554 if (IsRequiredSGPR)
9555 return MO.getReg();
9556
9557 // If this could be a VGPR or an SGPR, check the dynamic register class.
9558 Register Reg = MO.getReg();
9559 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9560 if (RI.isSGPRClass(RegRC))
9561 UsedSGPRs[i] = Reg;
9562 }
9563
9564 // We don't have a required SGPR operand, so we have a bit more freedom in
9565 // selecting operands to move.
9566
9567 // Try to select the most used SGPR. If an SGPR is equal to one of the
9568 // others, we choose that.
9569 //
9570 // e.g.
9571 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9572 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9573
9574 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9575 // prefer those.
9576
9577 if (UsedSGPRs[0]) {
9578 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9579 SGPRReg = UsedSGPRs[0];
9580 }
9581
9582 if (!SGPRReg && UsedSGPRs[1]) {
9583 if (UsedSGPRs[1] == UsedSGPRs[2])
9584 SGPRReg = UsedSGPRs[1];
9585 }
9586
9587 return SGPRReg;
9588}
9589
9590MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9591 AMDGPU::OpName OperandName) const {
9592 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9593 return nullptr;
9594
9595 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9596 if (Idx == -1)
9597 return nullptr;
9598
9599 return &MI.getOperand(Idx);
9600}
9601
9602uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9603 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9604 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9605 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9606 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9607 return (Format << 44) |
9608 (1ULL << 56) | // RESOURCE_LEVEL = 1
9609 (3ULL << 60); // OOB_SELECT = 3
9610 }
9611
9612 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9613 if (ST.isAmdHsaOS()) {
9614 // Set ATC = 1. GFX9 doesn't have this bit.
9615 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9616 RsrcDataFormat |= (1ULL << 56);
9617
9618 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9619 // BTW, it disables TC L2 and therefore decreases performance.
9620 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9621 RsrcDataFormat |= (2ULL << 59);
9622 }
9623
9624 return RsrcDataFormat;
9625}
9626
9627uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9628 uint64_t Rsrc23 = AMDGPU::RSRC_DATA_FORMAT |
9629 AMDGPU::RSRC_TID_ENABLE |
9630 0xffffffff; // Size;
9631
9632 // GFX9 doesn't have ELEMENT_SIZE.
9633 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9634 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9635 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9636 }
9637
9638 // IndexStride = 64 / 32.
9639 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9640 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9641
9642 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9643 // Clear them unless we want a huge stride.
9644 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9645 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9646 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9647
9648 return Rsrc23;
9649}
9650
9651bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9652 unsigned Opc = MI.getOpcode();
9653
9654 return isSMRD(Opc);
9655}
9656
9657bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9658 return get(Opc).mayLoad() &&
9659 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9660}
9661
9662Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9663 int &FrameIndex) const {
9664 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9665 if (!Addr || !Addr->isFI())
9666 return Register();
9667
9668 assert(!MI.memoperands_empty() &&
9669 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9670
9671 FrameIndex = Addr->getIndex();
9672 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9673}
9674
9675Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9676 int &FrameIndex) const {
9677 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9678 assert(Addr && Addr->isFI());
9679 FrameIndex = Addr->getIndex();
9680 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9681}
9682
9683Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9684 int &FrameIndex) const {
9685 if (!MI.mayLoad())
9686 return Register();
9687
9688 if (isMUBUF(MI) || isVGPRSpill(MI))
9689 return isStackAccess(MI, FrameIndex);
9690
9691 if (isSGPRSpill(MI))
9692 return isSGPRStackAccess(MI, FrameIndex);
9693
9694 return Register();
9695}
9696
9697Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9698 int &FrameIndex) const {
9699 if (!MI.mayStore())
9700 return Register();
9701
9702 if (isMUBUF(MI) || isVGPRSpill(MI))
9703 return isStackAccess(MI, FrameIndex);
9704
9705 if (isSGPRSpill(MI))
9706 return isSGPRStackAccess(MI, FrameIndex);
9707
9708 return Register();
9709}
9710
9711unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9712 unsigned Size = 0;
9713 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9714 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9715 while (++I != E && I->isInsideBundle()) {
9716 assert(!I->isBundle() && "No nested bundle!");
9717 Size += getInstSizeInBytes(*I);
9718 }
9719
9720 return Size;
9721}
9722
9723unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9724 unsigned Opc = MI.getOpcode();
9725 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9726 unsigned DescSize = Desc.getSize();
9727
9728 // If we have a definitive size, we can use it. Otherwise we need to inspect
9729 // the operands to know the size.
9730 if (isFixedSize(MI)) {
9731 unsigned Size = DescSize;
9732
9733 // If we hit the buggy offset, an extra nop will be inserted in MC so
9734 // estimate the worst case.
9735 if (MI.isBranch() && ST.hasOffset3fBug())
9736 Size += 4;
9737
9738 return Size;
9739 }
9740
9741 // Instructions may have a 32-bit literal encoded after them. Check
9742 // operands that could ever be literals.
9743 if (isVALU(MI) || isSALU(MI)) {
9744 if (isDPP(MI))
9745 return DescSize;
9746 bool HasLiteral = false;
9747 unsigned LiteralSize = 4;
9748 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9749 const MachineOperand &Op = MI.getOperand(I);
9750 const MCOperandInfo &OpInfo = Desc.operands()[I];
9751 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9752 HasLiteral = true;
9753 if (ST.has64BitLiterals()) {
9754 switch (OpInfo.OperandType) {
9755 default:
9756 break;
9757 case AMDGPU::OPERAND_REG_IMM_FP64:
9758 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9759 LiteralSize = 8;
9760 break;
9761 case AMDGPU::OPERAND_REG_IMM_INT64:
9762 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9763 LiteralSize = 8;
9764 break;
9765 }
9766 }
9767 break;
9768 }
9769 }
9770 return HasLiteral ? DescSize + LiteralSize : DescSize;
9771 }
9772
9773 // Check whether we have extra NSA words.
9774 if (isMIMG(MI)) {
9775 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9776 if (VAddr0Idx < 0)
9777 return 8;
9778
9779 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9780 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9781 }
9782
9783 switch (Opc) {
9784 case TargetOpcode::BUNDLE:
9785 return getInstBundleSize(MI);
9786 case TargetOpcode::INLINEASM:
9787 case TargetOpcode::INLINEASM_BR: {
9788 const MachineFunction *MF = MI.getMF();
9789 const char *AsmStr = MI.getOperand(0).getSymbolName();
9790 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9791 }
9792 default:
9793 if (MI.isMetaInstruction())
9794 return 0;
9795
9796 // If D16 Pseudo inst, get correct MC code size
9797 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9798 if (D16Info) {
9799 // Assume the d16_lo/hi variants are always the same size.
9800 unsigned LoInstOpcode = D16Info->LoOp;
9801 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9802 DescSize = Desc.getSize();
9803 }
9804
9805 // If FMA Pseudo inst, get correct MC code size
9806 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9807 // All potential lowerings are the same size; arbitrarily pick one.
9808 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9809 DescSize = Desc.getSize();
9810 }
9811
9812 return DescSize;
9813 }
9814}
9815
9816bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
9817 if (!isFLAT(MI))
9818 return false;
9819
9820 if (MI.memoperands_empty())
9821 return true;
9822
9823 for (const MachineMemOperand *MMO : MI.memoperands()) {
9824 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9825 return true;
9826 }
9827 return false;
9828}
9829
9830ArrayRef<std::pair<int, const char *>>
9831SIInstrInfo::getSerializableTargetIndices() const {
9832 static const std::pair<int, const char *> TargetIndices[] = {
9833 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9834 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9835 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9836 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9837 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9838 return ArrayRef(TargetIndices);
9839}
9840
9841/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9842/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9843ScheduleHazardRecognizer *
9844SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9845 const ScheduleDAG *DAG) const {
9846 return new GCNHazardRecognizer(DAG->MF);
9847}
9848
9849/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9850/// pass.
9851ScheduleHazardRecognizer *
9852SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
9853 return new GCNHazardRecognizer(MF);
9854}
9855
9856// Called during:
9857// - pre-RA scheduling and post-RA scheduling
9858ScheduleHazardRecognizer *
9859SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9860 const ScheduleDAGMI *DAG) const {
9861 // Borrowed from Arm Target
9862 // We would like to restrict this hazard recognizer to only
9863 // post-RA scheduling; we can tell that we're post-RA because we don't
9864 // track VRegLiveness.
9865 if (!DAG->hasVRegLiveness())
9866 return new GCNHazardRecognizer(DAG->MF);
9867 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9868}
9869
9870std::pair<unsigned, unsigned>
9871SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9872 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9873}
9874
9875ArrayRef<std::pair<unsigned, const char *>>
9876SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9877 static const std::pair<unsigned, const char *> TargetFlags[] = {
9878 {MO_GOTPCREL, "amdgpu-gotprel"},
9879 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9880 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9881 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9882 {MO_REL32_LO, "amdgpu-rel32-lo"},
9883 {MO_REL32_HI, "amdgpu-rel32-hi"},
9884 {MO_REL64, "amdgpu-rel64"},
9885 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9886 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9887 {MO_ABS64, "amdgpu-abs64"},
9888 };
9889
9890 return ArrayRef(TargetFlags);
9891}
9892
9893ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9894SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9895 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9896 {
9897 {MONoClobber, "amdgpu-noclobber"},
9898 {MOLastUse, "amdgpu-last-use"},
9899 {MOCooperative, "amdgpu-cooperative"},
9900 };
9901
9902 return ArrayRef(TargetFlags);
9903}
9904
9905unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9906 const MachineFunction &MF) const {
9907 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9908 assert(SrcReg.isVirtual());
9909 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9910 return AMDGPU::WWM_COPY;
9911
9912 return AMDGPU::COPY;
9913}
9914
9915bool SIInstrInfo::canAddToBBProlog(const MachineInstr &MI) const {
9916 uint16_t Opcode = MI.getOpcode();
9917 // Check if it is SGPR spill or wwm-register spill Opcode.
9918 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
9919 return true;
9920
9921 const MachineFunction *MF = MI.getMF();
9922 const MachineRegisterInfo &MRI = MF->getRegInfo();
9923 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9924
9925 // See if this is a live-range split instruction inserted for an SGPR or a
9926 // wwm-register. The implicit defs inserted for wwm-registers should also be
9927 // included, as they can appear at the start of the basic block.
9928 bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
9929 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
9930 return false;
9931
9932 Register Reg = MI.getOperand(0).getReg();
9933 if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
9934 return IsLRSplitInst;
9935
9936 return MFI->isWWMReg(Reg);
9937}
9938
9939bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9940 Register Reg) const {
9941 // We need to handle instructions which may be inserted during register
9942 // allocation to handle the prolog. The initial prolog instruction may have
9943 // been separated from the start of the block by spills and copies inserted
9944 // for the prolog. However, the insertions for scalar registers can
9945 // always be placed at the BB top as they are independent of the exec mask
9946 // value.
9947 bool IsNullOrVectorRegister = true;
9948 if (Reg) {
9949 const MachineFunction *MF = MI.getMF();
9950 const MachineRegisterInfo &MRI = MF->getRegInfo();
9951 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9952 }
9953
9954 return IsNullOrVectorRegister &&
9955 (canAddToBBProlog(MI) ||
9956 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
9957 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9958}
9959
9960MachineInstrBuilder
9961SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9962 MachineBasicBlock::iterator I,
9963 const DebugLoc &DL,
9964 Register DestReg) const {
9965 if (ST.hasAddNoCarryInsts())
9966 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9967
9968 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9969 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9970 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9971
9972 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9973 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9974}
9975
9976MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9977 MachineBasicBlock::iterator I,
9978 const DebugLoc &DL,
9979 Register DestReg,
9980 RegScavenger &RS) const {
9981 if (ST.hasAddNoCarryInsts())
9982 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9983
9984 // If available, prefer to use vcc.
9985 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9986 ? Register(RI.getVCC())
9987 : RS.scavengeRegisterBackwards(
9988 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9989 0, /* AllowSpill */ false);
9990
9991 // TODO: Users need to deal with this.
9992 if (!UnusedCarry.isValid())
9993 return MachineInstrBuilder();
9994
9995 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9996 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9997}
9998
9999bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10000 switch (Opcode) {
10001 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10002 case AMDGPU::SI_KILL_I1_TERMINATOR:
10003 return true;
10004 default:
10005 return false;
10006 }
10007}
10008
10009const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
10010 switch (Opcode) {
10011 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10012 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10013 case AMDGPU::SI_KILL_I1_PSEUDO:
10014 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
10015 default:
10016 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10017 }
10018}
10019
10020bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10021 return Imm <= getMaxMUBUFImmOffset(ST);
10022}
10023
10024unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
10025 // GFX12 field is non-negative 24-bit signed byte offset.
10026 const unsigned OffsetBits =
10027 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10028 return (1 << OffsetBits) - 1;
10029}
10030
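// On wave32 targets the implicit VCC operands taken from the 64-bit
// instruction descriptions must refer to VCC_LO instead.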
10031void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
10032 if (!ST.isWave32())
10033 return;
10034
10035 if (MI.isInlineAsm())
10036 return;
10037
10038 if (MI.getNumOperands() < MI.getNumExplicitOperands())
10039 return;
10040
10041 for (auto &Op : MI.implicit_operands()) {
10042 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10043 Op.setReg(AMDGPU::VCC_LO);
10044 }
10045}
10046
10047bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
10048 if (!isSMRD(MI))
10049 return false;
10050
10051 // Check that it is using a buffer resource.
10052 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
10053 if (Idx == -1) // e.g. s_memtime
10054 return false;
10055
10056 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
10057 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
10058}
10059
10060// Given Imm, split it into the values to put into the SOffset and ImmOffset
10061// fields in an MUBUF instruction. Return false if it is not possible (due to a
10062// hardware bug needing a workaround).
10063//
10064// The required alignment ensures that individual address components remain
10065// aligned if they are aligned to begin with. It also ensures that additional
10066// offsets within the given alignment can be added to the resulting ImmOffset.
10067bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
10068 uint32_t &ImmOffset, Align Alignment) const {
10069 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10070 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
10071 uint32_t Overflow = 0;
10072
10073 if (Imm > MaxImm) {
10074 if (Imm <= MaxImm + 64) {
10075 // Use an SOffset inline constant for 4..64
10076 Overflow = Imm - MaxImm;
10077 Imm = MaxImm;
10078 } else {
10079 // Try to keep the same value in SOffset for adjacent loads, so that
10080 // the corresponding register contents can be re-used.
10081 //
10082 // Load values with all low-bits (except for alignment bits) set into
10083 // SOffset, so that a larger range of values can be covered using
10084 // s_movk_i32.
10085 //
10086 // Atomic operations fail to work correctly when individual address
10087 // components are unaligned, even if their sum is aligned.
10088 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10089 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10090 Imm = Low;
10091 Overflow = High - Alignment.value();
10092 }
10093 }
10094
10095 if (Overflow > 0) {
10096 // There is a hardware bug in SI and CI which prevents address clamping in
10097 // MUBUF instructions from working correctly with SOffsets. The immediate
10098 // offset is unaffected.
10099 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10100 return false;
10101
10102 // It is not possible to set immediate in SOffset field on some targets.
10103 if (ST.hasRestrictedSOffset())
10104 return false;
10105 }
10106
10107 ImmOffset = Imm;
10108 SOffset = Overflow;
10109 return true;
10110}
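// Worked example (hypothetical values, 12-bit field, Align(4)):
//   MaxImm = alignDown(4095, 4) = 4092
//   Imm = 4100  -> SOffset = 8,    ImmOffset = 4092  (SOffset inline constant)
//   Imm = 10000 -> High = (10000 + 4) & ~4095 = 8192, Low = 1812,
//                  SOffset = 8188, ImmOffset = 1812  (8188 + 1812 == 10000)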
10111
10112// Depending on the used address space and instructions, some immediate offsets
10113// are allowed and some are not.
10114// Pre-GFX12, flat instruction offsets can only be non-negative; global and
10115// scratch instruction offsets can also be negative. On GFX12, offsets can be
10116// negative for all variants.
10117//
10118// There are several bugs related to these offsets:
10119// On gfx10.1, flat instructions that go into the global address space cannot
10120// use an offset.
10121//
10122// For scratch instructions, the address can be either an SGPR or a VGPR.
10123// The following offsets can be used, depending on the architecture (x means
10124// cannot be used):
10125// +----------------------------+------+------+
10126// | Address-Mode | SGPR | VGPR |
10127// +----------------------------+------+------+
10128// | gfx9 | | |
10129// | negative, 4-aligned offset | x | ok |
10130// | negative, unaligned offset | x | ok |
10131// +----------------------------+------+------+
10132// | gfx10 | | |
10133// | negative, 4-aligned offset | ok | ok |
10134// | negative, unaligned offset | ok | x |
10135// +----------------------------+------+------+
10136// | gfx10.3 | | |
10137// | negative, 4-aligned offset | ok | ok |
10138// | negative, unaligned offset | ok | ok |
10139// +----------------------------+------+------+
10140//
10141// This function ignores the addressing mode, so if an offset cannot be used in
10142// one addressing mode, it is considered illegal.
10143bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10144 uint64_t FlatVariant) const {
10145 // TODO: Should 0 be special cased?
10146 if (!ST.hasFlatInstOffsets())
10147 return false;
10148
10149 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10150 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10151 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10152 return false;
10153
10154 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10155 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10156 (Offset % 4) != 0) {
10157 return false;
10158 }
10159
10160 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10161 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10162 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10163}
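// Illustrative check (assuming a subtarget that reports N = 13 offset bits):
// isIntN(13, Offset) accepts -4096..4095, so a variant that allows negative
// offsets gets the full range [-4096, 4095], while other variants only accept
// [0, 4095], subject to the bug workarounds above.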
10164
10165// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
10166std::pair<int64_t, int64_t>
10167SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10168 uint64_t FlatVariant) const {
10169 int64_t RemainderOffset = COffsetVal;
10170 int64_t ImmField = 0;
10171
10172 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10173 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10174
10175 if (AllowNegative) {
10176 // Use signed division by a power of two to truncate towards 0.
10177 int64_t D = 1LL << NumBits;
10178 RemainderOffset = (COffsetVal / D) * D;
10179 ImmField = COffsetVal - RemainderOffset;
10180
10181 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10182 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10183 (ImmField % 4) != 0) {
10184 // Make ImmField a multiple of 4
10185 RemainderOffset += ImmField % 4;
10186 ImmField -= ImmField % 4;
10187 }
10188 } else if (COffsetVal >= 0) {
10189 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10190 RemainderOffset = COffsetVal - ImmField;
10191 }
10192
10193 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10194 assert(RemainderOffset + ImmField == COffsetVal);
10195 return {ImmField, RemainderOffset};
10196}
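// Worked example (assuming NumBits = 11, i.e. D = 2048, negatives allowed):
//   COffsetVal =  5000 -> RemainderOffset =  4096, ImmField =  904
//   COffsetVal = -5000 -> RemainderOffset = -4096, ImmField = -904
// In both cases RemainderOffset + ImmField == COffsetVal, as asserted above.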
10197
10198bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
10199 if (ST.hasNegativeScratchOffsetBug() &&
10200 FlatVariant == SIInstrFlags::FlatScratch)
10201 return false;
10202
10203 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
10204}
10205
10206static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10207 switch (ST.getGeneration()) {
10208 default:
10209 break;
10210 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
10211 case AMDGPUSubtarget::SEA_ISLANDS:
10212 return SIEncodingFamily::SI;
10213 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
10214 case AMDGPUSubtarget::GFX9:
10215 return SIEncodingFamily::VI;
10216 case AMDGPUSubtarget::GFX10:
10217 return SIEncodingFamily::GFX10;
10218 case AMDGPUSubtarget::GFX11:
10219 return SIEncodingFamily::GFX11;
10220 case AMDGPUSubtarget::GFX12:
10221 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10222 : SIEncodingFamily::GFX12;
10225 }
10226 llvm_unreachable("Unknown subtarget generation!");
10227}
10228
10229bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10230 switch(MCOp) {
10231 // These opcodes use indirect register addressing, so
10232 // they need special handling by codegen (currently missing).
10233 // Therefore it is too risky to allow these opcodes
10234 // to be selected by the DPP combiner or the SDWA peephole pass.
10235 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10236 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10237 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10238 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10239 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10240 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10241 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10242 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10243 return true;
10244 default:
10245 return false;
10246 }
10247}
10248
10249#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10250 case OPCODE##_dpp: \
10251 case OPCODE##_e32: \
10252 case OPCODE##_e64: \
10253 case OPCODE##_e64_dpp: \
10254 case OPCODE##_sdwa:
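// For reference, GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32) expands to:
//   case AMDGPU::V_ADD_U32_dpp:
//   case AMDGPU::V_ADD_U32_e32:
//   case AMDGPU::V_ADD_U32_e64:
//   case AMDGPU::V_ADD_U32_e64_dpp:
//   case AMDGPU::V_ADD_U32_sdwa: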
10255
10256static bool isRenamedInGFX9(int Opcode) {
10257 switch (Opcode) {
10258 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10259 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10260 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10261 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10262 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10263 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10264 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10265 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10266 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10267 //
10268 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10269 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10270 case AMDGPU::V_FMA_F16_gfx9_e64:
10271 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10272 case AMDGPU::V_INTERP_P2_F16:
10273 case AMDGPU::V_MAD_F16_e64:
10274 case AMDGPU::V_MAD_U16_e64:
10275 case AMDGPU::V_MAD_I16_e64:
10276 return true;
10277 default:
10278 return false;
10279 }
10280}
10281
10282int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10283 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10284 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10285
10286 unsigned Gen = subtargetEncodingFamily(ST);
10287
10288 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10289 Gen = SIEncodingFamily::GFX9;
10290
10291 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10292 // subtarget has UnpackedD16VMem feature.
10293 // TODO: remove this when we discard GFX80 encoding.
10294 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10295 Gen = SIEncodingFamily::GFX80;
10296
10297 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10298 switch (ST.getGeneration()) {
10299 default:
10300 Gen = SIEncodingFamily::SDWA;
10301 break;
10302 case AMDGPUSubtarget::GFX9:
10303 Gen = SIEncodingFamily::SDWA9;
10304 break;
10305 case AMDGPUSubtarget::GFX10:
10306 Gen = SIEncodingFamily::SDWA10;
10307 break;
10308 }
10309 }
10310
10311 if (isMAI(Opcode)) {
10312 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10313 if (MFMAOp != -1)
10314 Opcode = MFMAOp;
10315 }
10316
10317 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10318
10319 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
10320 MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX12);
10321
10322 // -1 means that Opcode is already a native instruction.
10323 if (MCOp == -1)
10324 return Opcode;
10325
10326 if (ST.hasGFX90AInsts()) {
10327 uint16_t NMCOp = (uint16_t)-1;
10328 if (ST.hasGFX940Insts())
10329 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
10330 if (NMCOp == (uint16_t)-1)
10331 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
10332 if (NMCOp == (uint16_t)-1)
10333 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
10334 if (NMCOp != (uint16_t)-1)
10335 MCOp = NMCOp;
10336 }
10337
10338 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10339 // no encoding in the given subtarget generation.
10340 if (MCOp == (uint16_t)-1)
10341 return -1;
10342
10343 if (isAsmOnlyOpcode(MCOp))
10344 return -1;
10345
10346 return MCOp;
10347}
10348
10349static
10350TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
10351 assert(RegOpnd.isReg());
10352 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10353 getRegSubRegPair(RegOpnd);
10354}
10355
10356TargetInstrInfo::RegSubRegPair
10357llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
10358 assert(MI.isRegSequence());
10359 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10360 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10361 auto &RegOp = MI.getOperand(1 + 2 * I);
10362 return getRegOrUndef(RegOp);
10363 }
10364 return TargetInstrInfo::RegSubRegPair();
10365}
10366
10367// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10368// Following a subreg of reg:subreg isn't supported
10369static bool followSubRegDef(MachineInstr &MI,
10370 TargetInstrInfo::RegSubRegPair &RSR) {
10371 if (!RSR.SubReg)
10372 return false;
10373 switch (MI.getOpcode()) {
10374 default: break;
10375 case AMDGPU::REG_SEQUENCE:
10376 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10377 return true;
10378 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
10379 case AMDGPU::INSERT_SUBREG:
10380 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10381 // inserted the subreg we're looking for
10382 RSR = getRegOrUndef(MI.getOperand(2));
10383 else { // the subreg in the rest of the reg
10384 auto R1 = getRegOrUndef(MI.getOperand(1));
10385 if (R1.SubReg) // subreg of subreg isn't supported
10386 return false;
10387 RSR.Reg = R1.Reg;
10388 }
10389 return true;
10390 }
10391 return false;
10392}
10393
10394MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
10395 const MachineRegisterInfo &MRI) {
10396 assert(MRI.isSSA());
10397 if (!P.Reg.isVirtual())
10398 return nullptr;
10399
10400 auto RSR = P;
10401 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10402 while (auto *MI = DefInst) {
10403 DefInst = nullptr;
10404 switch (MI->getOpcode()) {
10405 case AMDGPU::COPY:
10406 case AMDGPU::V_MOV_B32_e32: {
10407 auto &Op1 = MI->getOperand(1);
10408 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10409 if (Op1.isUndef())
10410 return nullptr;
10411 RSR = getRegSubRegPair(Op1);
10412 DefInst = MRI.getVRegDef(RSR.Reg);
10413 }
10414 break;
10415 }
10416 default:
10417 if (followSubRegDef(*MI, RSR)) {
10418 if (!RSR.Reg)
10419 return nullptr;
10420 DefInst = MRI.getVRegDef(RSR.Reg);
10421 }
10422 }
10423 if (!DefInst)
10424 return MI;
10425 }
10426 return nullptr;
10427}
10428
10430 Register VReg,
10431 const MachineInstr &DefMI,
10432 const MachineInstr &UseMI) {
10433 assert(MRI.isSSA() && "Must be run on SSA");
10434
10435 auto *TRI = MRI.getTargetRegisterInfo();
10436 auto *DefBB = DefMI.getParent();
10437
10438 // Don't bother searching between blocks, although it is possible this block
10439 // doesn't modify exec.
10440 if (UseMI.getParent() != DefBB)
10441 return true;
10442
10443 const int MaxInstScan = 20;
10444 int NumInst = 0;
10445
10446 // Stop scan at the use.
10447 auto E = UseMI.getIterator();
10448 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10449 if (I->isDebugInstr())
10450 continue;
10451
10452 if (++NumInst > MaxInstScan)
10453 return true;
10454
10455 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10456 return true;
10457 }
10458
10459 return false;
10460}
10461
10462bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10463 Register VReg,
10464 const MachineInstr &DefMI) {
10465 assert(MRI.isSSA() && "Must be run on SSA");
10466
10467 auto *TRI = MRI.getTargetRegisterInfo();
10468 auto *DefBB = DefMI.getParent();
10469
10470 const int MaxUseScan = 10;
10471 int NumUse = 0;
10472
10473 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10474 auto &UseInst = *Use.getParent();
10475 // Don't bother searching between blocks, although it is possible this block
10476 // doesn't modify exec.
10477 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10478 return true;
10479
10480 if (++NumUse > MaxUseScan)
10481 return true;
10482 }
10483
10484 if (NumUse == 0)
10485 return false;
10486
10487 const int MaxInstScan = 20;
10488 int NumInst = 0;
10489
10490 // Stop scan when we have seen all the uses.
10491 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10492 assert(I != DefBB->end());
10493
10494 if (I->isDebugInstr())
10495 continue;
10496
10497 if (++NumInst > MaxInstScan)
10498 return true;
10499
10500 for (const MachineOperand &Op : I->operands()) {
10501 // We don't check reg masks here as they're used only on calls:
10502 // 1. EXEC is only considered const within one BB
10503 // 2. Call should be a terminator instruction if present in a BB
10504
10505 if (!Op.isReg())
10506 continue;
10507
10508 Register Reg = Op.getReg();
10509 if (Op.isUse()) {
10510 if (Reg == VReg && --NumUse == 0)
10511 return false;
10512 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10513 return true;
10514 }
10515 }
10516}
10517
10518MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10519 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10520 const DebugLoc &DL, Register Src, Register Dst) const {
10521 auto Cur = MBB.begin();
10522 if (Cur != MBB.end())
10523 do {
10524 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10525 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10526 ++Cur;
10527 } while (Cur != MBB.end() && Cur != LastPHIIt);
10528
10529 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10530 Dst);
10531}
10532
10533MachineInstr *SIInstrInfo::createPHISourceCopy(
10534 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10535 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10536 if (InsPt != MBB.end() &&
10537 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10538 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10539 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10540 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10541 InsPt++;
10542 return BuildMI(MBB, InsPt, DL,
10543 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10544 .addReg(Src, {}, SrcSubReg)
10545 .addReg(AMDGPU::EXEC, RegState::Implicit);
10546 }
10547 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10548 Dst);
10549}
10550
10551bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10552
10553MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
10554 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
10555 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10556 VirtRegMap *VRM) const {
10557 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10558 //
10559 // %0:sreg_32 = COPY $m0
10560 //
10561 // We explicitly chose SReg_32 for the virtual register so such a copy might
10562 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10563 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10564 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10565 // TargetInstrInfo::foldMemoryOperand() is going to try.
10566 // A similar issue also exists with spilling and reloading $exec registers.
10567 //
10568 // To prevent that, constrain the %0 register class here.
10569 if (isFullCopyInstr(MI)) {
10570 Register DstReg = MI.getOperand(0).getReg();
10571 Register SrcReg = MI.getOperand(1).getReg();
10572 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10573 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10574 MachineRegisterInfo &MRI = MF.getRegInfo();
10575 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10576 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10577 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10578 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10579 return nullptr;
10580 }
10581 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10582 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10583 return nullptr;
10584 }
10585 }
10586 }
10587
10588 return nullptr;
10589}
10590
10591unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10592 const MachineInstr &MI,
10593 unsigned *PredCost) const {
10594 if (MI.isBundle()) {
10595 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
10596 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10597 unsigned Lat = 0, Count = 0;
10598 for (++I; I != E && I->isBundledWithPred(); ++I) {
10599 ++Count;
10600 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10601 }
10602 return Lat + Count - 1;
10603 }
10604
10605 return SchedModel.computeInstrLatency(&MI);
10606}
10607
10608const MachineOperand &
10609SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const {
10610 if (const MachineOperand *CallAddrOp =
10611 getNamedOperand(MI, AMDGPU::OpName::src0))
10612 return *CallAddrOp;
10613 return TargetInstrInfo::getCalleeOperand(MI);
10614}
10615
10616InstructionUniformity
10617SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10618 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10619 unsigned Opcode = MI.getOpcode();
10620
10621 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10622 Register Dst = MI.getOperand(0).getReg();
10623 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10624 : MI.getOperand(1).getReg();
10625 LLT DstTy = MRI.getType(Dst);
10626 LLT SrcTy = MRI.getType(Src);
10627 unsigned DstAS = DstTy.getAddressSpace();
10628 unsigned SrcAS = SrcTy.getAddressSpace();
10629 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10630 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10631 ST.hasGloballyAddressableScratch()
10632 ? InstructionUniformity::NeverUniform
10633 : InstructionUniformity::Default;
10634 };
10635
10636 // If the target supports globally addressable scratch, the mapping from
10637 // scratch memory to the flat aperture changes therefore an address space cast
10638 // is no longer uniform.
10639 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10640 return HandleAddrSpaceCast(MI);
10641
10642 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10643 auto IID = GI->getIntrinsicID();
10644 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10645 return InstructionUniformity::NeverUniform;
10646 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10647 return InstructionUniformity::AlwaysUniform;
10648
10649 switch (IID) {
10650 case Intrinsic::amdgcn_addrspacecast_nonnull:
10651 return HandleAddrSpaceCast(MI);
10652 case Intrinsic::amdgcn_if:
10653 case Intrinsic::amdgcn_else:
10654 // FIXME: Uniform if second result
10655 break;
10656 }
10657
10658 return InstructionUniformity::Default;
10659 }
10660
10661 // Loads from the private and flat address spaces are divergent, because
10662 // threads can execute the load instruction with the same inputs and get
10663 // different results.
10664 //
10665 // All other loads are not divergent, because if threads issue loads with the
10666 // same arguments, they will always get the same result.
10667 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10668 Opcode == AMDGPU::G_SEXTLOAD) {
10669 if (MI.memoperands_empty())
10670 return InstructionUniformity::NeverUniform; // conservative assumption
10671
10672 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10673 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10674 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10675 })) {
10676 // At least one MMO in a non-global address space.
10677 return InstructionUniformity::NeverUniform;
10678 }
10679 return InstructionUniformity::Default;
10680 }
10681
10682 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10683 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10684 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10685 AMDGPU::isGenericAtomic(Opcode)) {
10686 return InstructionUniformity::NeverUniform;
10687 }
10688 return InstructionUniformity::Default;
10689}
10690
10691const MIRFormatter *SIInstrInfo::getMIRFormatter() const {
10692 if (!Formatter)
10693 Formatter = std::make_unique<AMDGPUMIRFormatter>(ST);
10694 return Formatter.get();
10695}
10696
10697InstructionUniformity
10698SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10699
10700 if (isNeverUniform(MI))
10701 return InstructionUniformity::NeverUniform;
10702
10703 unsigned opcode = MI.getOpcode();
10704 if (opcode == AMDGPU::V_READLANE_B32 ||
10705 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10706 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10707 return InstructionUniformity::AlwaysUniform;
10708
10709 if (isCopyInstr(MI)) {
10710 const MachineOperand &srcOp = MI.getOperand(1);
10711 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10712 const TargetRegisterClass *regClass =
10713 RI.getPhysRegBaseClass(srcOp.getReg());
10714 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10715 : InstructionUniformity::NeverUniform;
10716 }
10717 return InstructionUniformity::Default;
10718 }
10719
10720 // GMIR handling
10721 if (MI.isPreISelOpcode())
10722 return getGenericInstructionUniformity(MI);
10723
10724 // Atomics are divergent because they are executed sequentially: when an
10725 // atomic operation refers to the same address in each thread, then each
10726 // thread after the first sees the value written by the previous thread as
10727 // the original value.
10728
10729 if (isAtomic(MI))
10730 return InstructionUniformity::NeverUniform;
10731
10732 // Loads from the private and flat address spaces are divergent, because
10733 // threads can execute the load instruction with the same inputs and get
10734 // different results.
10735 if (isFLAT(MI) && MI.mayLoad()) {
10736 if (MI.memoperands_empty())
10737 return InstructionUniformity::NeverUniform; // conservative assumption
10738
10739 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10740 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10741 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10742 })) {
10743 // At least one MMO in a non-global address space.
10744 return InstructionUniformity::NeverUniform;
10745 }
10746
10747 return InstructionUniformity::Default;
10748 }
10749
10750 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10751 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10752
10753 // FIXME: It's conceptually broken to report this for an instruction, and not
10754 // a specific def operand. For inline asm in particular, there could be mixed
10755 // uniform and divergent results.
10756 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10757 const MachineOperand &SrcOp = MI.getOperand(I);
10758 if (!SrcOp.isReg())
10759 continue;
10760
10761 Register Reg = SrcOp.getReg();
10762 if (!Reg || !SrcOp.readsReg())
10763 continue;
10764
10765 // If RegBank is null, this is unassigned or an unallocatable special
10766 // register, which are all scalars.
10767 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10768 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10769 return InstructionUniformity::NeverUniform;
10770 }
10771
10772 // TODO: The uniformity check conditions above can be rearranged for more
10773 // readability.
10774
10775 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10776 // currently turned into no-op COPYs by SelectionDAG ISel and are
10777 // therefore no longer recognizable.
10778
10779 return InstructionUniformity::Default;
10780}
10781
10782unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10783 switch (MF.getFunction().getCallingConv()) {
10784 case CallingConv::AMDGPU_PS:
10785 return 1;
10786 case CallingConv::AMDGPU_VS:
10787 return 2;
10788 case CallingConv::AMDGPU_GS:
10789 return 3;
10790 case CallingConv::AMDGPU_HS:
10791 case CallingConv::AMDGPU_LS:
10792 case CallingConv::AMDGPU_ES: {
10793 const Function &F = MF.getFunction();
10794 F.getContext().diagnose(DiagnosticInfoUnsupported(
10795 F, "ds_ordered_count unsupported for this calling conv"));
10796 [[fallthrough]];
10797 }
10798 case CallingConv::AMDGPU_CS:
10799 case CallingConv::AMDGPU_KERNEL:
10800 case CallingConv::C:
10801 case CallingConv::Fast:
10802 default:
10803 // Assume other calling conventions are various compute callable functions
10804 return 0;
10805 }
10806}
10807
10808bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10809 Register &SrcReg2, int64_t &CmpMask,
10810 int64_t &CmpValue) const {
10811 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10812 return false;
10813
10814 switch (MI.getOpcode()) {
10815 default:
10816 break;
10817 case AMDGPU::S_CMP_EQ_U32:
10818 case AMDGPU::S_CMP_EQ_I32:
10819 case AMDGPU::S_CMP_LG_U32:
10820 case AMDGPU::S_CMP_LG_I32:
10821 case AMDGPU::S_CMP_LT_U32:
10822 case AMDGPU::S_CMP_LT_I32:
10823 case AMDGPU::S_CMP_GT_U32:
10824 case AMDGPU::S_CMP_GT_I32:
10825 case AMDGPU::S_CMP_LE_U32:
10826 case AMDGPU::S_CMP_LE_I32:
10827 case AMDGPU::S_CMP_GE_U32:
10828 case AMDGPU::S_CMP_GE_I32:
10829 case AMDGPU::S_CMP_EQ_U64:
10830 case AMDGPU::S_CMP_LG_U64:
10831 SrcReg = MI.getOperand(0).getReg();
10832 if (MI.getOperand(1).isReg()) {
10833 if (MI.getOperand(1).getSubReg())
10834 return false;
10835 SrcReg2 = MI.getOperand(1).getReg();
10836 CmpValue = 0;
10837 } else if (MI.getOperand(1).isImm()) {
10838 SrcReg2 = Register();
10839 CmpValue = MI.getOperand(1).getImm();
10840 } else {
10841 return false;
10842 }
10843 CmpMask = ~0;
10844 return true;
10845 case AMDGPU::S_CMPK_EQ_U32:
10846 case AMDGPU::S_CMPK_EQ_I32:
10847 case AMDGPU::S_CMPK_LG_U32:
10848 case AMDGPU::S_CMPK_LG_I32:
10849 case AMDGPU::S_CMPK_LT_U32:
10850 case AMDGPU::S_CMPK_LT_I32:
10851 case AMDGPU::S_CMPK_GT_U32:
10852 case AMDGPU::S_CMPK_GT_I32:
10853 case AMDGPU::S_CMPK_LE_U32:
10854 case AMDGPU::S_CMPK_LE_I32:
10855 case AMDGPU::S_CMPK_GE_U32:
10856 case AMDGPU::S_CMPK_GE_I32:
10857 SrcReg = MI.getOperand(0).getReg();
10858 SrcReg2 = Register();
10859 CmpValue = MI.getOperand(1).getImm();
10860 CmpMask = ~0;
10861 return true;
10862 }
10863
10864 return false;
10865}
10866
10867static bool isSCCDeadOnExit(MachineBasicBlock *MBB) {
10868 for (MachineBasicBlock *S : MBB->successors()) {
10869 if (S->isLiveIn(AMDGPU::SCC))
10870 return false;
10871 }
10872 return true;
10873}
10874
10875// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
10876// (incoming SCC) = !(SCC defined by SCCDef).
10877// Return true if all uses can be re-written, false otherwise.
10878bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
10879 MachineBasicBlock *MBB = SCCDef->getParent();
10880 SmallVector<MachineInstr *> InvertInstr;
10881 bool SCCIsDead = false;
10882
10883 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
10884 constexpr unsigned ScanLimit = 12;
10885 unsigned Count = 0;
10886 for (MachineInstr &MI :
10887 make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
10888 if (++Count > ScanLimit)
10889 return false;
10890 if (MI.readsRegister(AMDGPU::SCC, &RI)) {
10891 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
10892 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
10893 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10894 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
10895 InvertInstr.push_back(&MI);
10896 else
10897 return false;
10898 }
10899 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
10900 SCCIsDead = true;
10901 break;
10902 }
10903 }
10904 if (!SCCIsDead && isSCCDeadOnExit(MBB))
10905 SCCIsDead = true;
10906
10907 // SCC may have more uses. Can't invert all of them.
10908 if (!SCCIsDead)
10909 return false;
10910
10911 // Invert uses
10912 for (MachineInstr *MI : InvertInstr) {
10913 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
10914 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
10915 swapOperands(*MI);
10916 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10917 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
10918 MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
10919 ? AMDGPU::S_CBRANCH_SCC1
10920 : AMDGPU::S_CBRANCH_SCC0));
10921 } else {
10922 llvm_unreachable("SCC used but no inversion handling");
10923 }
10924 }
10925 return true;
10926}
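// Illustrative rewrite performed above (sketch):
//   %x = S_CSELECT_B32 %a, %b, implicit $scc  -->  operands swapped to %b, %a
//   S_CBRANCH_SCC1 %bb.target                 -->  S_CBRANCH_SCC0 %bb.target
// Any other reader of SCC, or exceeding the scan limit, aborts the inversion
// and the caller keeps the original compare.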
10927
10928// SCC is already valid after SCCValid.
10929// SCCRedefine will redefine SCC to the same value already available after
10930// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
10931// update kill/dead flags if necessary.
10932bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
10933 bool NeedInversion) const {
10934 MachineInstr *KillsSCC = nullptr;
10935 if (SCCValid->getParent() != SCCRedefine->getParent())
10936 return false;
10937 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
10938 SCCRedefine->getIterator())) {
10939 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
10940 return false;
10941 if (MI.killsRegister(AMDGPU::SCC, &RI))
10942 KillsSCC = &MI;
10943 }
10944 if (NeedInversion && !invertSCCUse(SCCRedefine))
10945 return false;
10946 if (MachineOperand *SccDef =
10947 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
10948 SccDef->setIsDead(false);
10949 if (KillsSCC)
10950 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
10951 SCCRedefine->eraseFromParent();
10952 return true;
10953}
10954
10955static bool foldableSelect(const MachineInstr &Def) {
10956 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
10957 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
10958 return false;
10959 bool Op1IsNonZeroImm =
10960 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
10961 bool Op2IsZeroImm =
10962 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
10963 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
10964 return false;
10965 return true;
10966}
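// Example of a foldable select (sketch):
//   %r = S_CSELECT_B32 1, 0, implicit $scc
// i.e. a non-zero immediate in operand 1 and the immediate 0 in operand 2, so
// the result is non-zero exactly when the incoming SCC was set.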
10967
10968static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
10969 unsigned &NewDefOpc) {
10970 // S_ADD_U32 X, 1 sets SCC on carryout which can only happen if result==0.
10971 // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
10972 if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
10973 Def.getOpcode() != AMDGPU::S_ADD_U32)
10974 return false;
10975 const MachineOperand &AddSrc1 = Def.getOperand(1);
10976 const MachineOperand &AddSrc2 = Def.getOperand(2);
10977 int64_t addend;
10978
10979 if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
10980 (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
10981 (!getFoldableImm(&AddSrc1, addend) || addend != 1) &&
10982 (!getFoldableImm(&AddSrc2, addend) || addend != 1))
10983 return false;
10984
10985 if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
10986 const MachineOperand *SccDef =
10987 Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
10988 if (!SccDef->isDead())
10989 return false;
10990 NewDefOpc = AMDGPU::S_ADD_U32;
10991 }
10992 NeedInversion = !NeedInversion;
10993 return true;
10994}
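// Worked case for the rule above: S_ADD_U32 %x, 1 produces a carry-out
// (SCC = 1) only when %x == 0xffffffff, i.e. exactly when the 32-bit result
// wraps to 0, which is why the caller's comparison sense must be inverted.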
10995
10996bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
10997 Register SrcReg2, int64_t CmpMask,
10998 int64_t CmpValue,
10999 const MachineRegisterInfo *MRI) const {
11000 if (!SrcReg || SrcReg.isPhysical())
11001 return false;
11002
11003 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
11004 return false;
11005
11006 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
11007 this](bool NeedInversion) -> bool {
11008 if (CmpValue != 0)
11009 return false;
11010
11011 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11012 if (!Def)
11013 return false;
11014
11015 // For S_OP that set SCC = DST!=0, do the transformation
11016 //
11017 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11018 //
11019 // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and
11020 // do the transformation:
11021 //
11022 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11023 //
11024 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
11025 // for S_CSELECT* already has the same value that will be calculated by
11026 // s_cmp_lg_*
11027 //
11028 // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
11029 // (non-zero imm), 0)
11030
11031 unsigned NewDefOpc = Def->getOpcode();
11032 if (!setsSCCIfResultIsNonZero(*Def) &&
11033 !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) &&
11034 !foldableSelect(*Def))
11035 return false;
11036
11037 if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
11038 return false;
11039
11040 if (NewDefOpc != Def->getOpcode())
11041 Def->setDesc(get(NewDefOpc));
11042
11043 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11044 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
11045 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
11046 // sX = s_cselect_b64 (non-zero imm), 0
11047 // sLo = copy sX.sub0
11048 // sHi = copy sX.sub1
11049 // sY = s_or_b32 sLo, sHi
11050 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11051 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
11052 const MachineOperand &OrOpnd1 = Def->getOperand(1);
11053 const MachineOperand &OrOpnd2 = Def->getOperand(2);
11054 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11055 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
11056 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
11057 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11058 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
11059 Def2->getOperand(1).isReg() &&
11060 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
11061 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
11062 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
11063 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
11064 if (Select && foldableSelect(*Select))
11065 optimizeSCC(Select, Def, /*NeedInversion=*/false);
11066 }
11067 }
11068 }
11069 return true;
11070 };
11071
11072 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11073 this](int64_t ExpectedValue, unsigned SrcSize,
11074 bool IsReversible, bool IsSigned) -> bool {
11075 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11076 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11077 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11078 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11079 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11080 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11081 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11082 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11083 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11084 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11085 //
11086 // Signed ge/gt are not used for the sign bit.
11087 //
11088 // If result of the AND is unused except in the compare:
11089 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11090 //
11091 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11092 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11093 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11094 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11095 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11096 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
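    // Concrete instance (illustrative), with n = 2:
    //   %t = S_AND_B32 %src, 4 ; S_CMP_LG_U32 %t, 0
    // folds to the S_AND_B32 alone (its SCC def is marked live), and if %t has
    // no other uses the AND itself is replaced by S_BITCMP1_B32 %src, 2.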
11097
11098 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11099 if (!Def)
11100 return false;
11101
11102 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11103 Def->getOpcode() != AMDGPU::S_AND_B64)
11104 return false;
11105
11106 int64_t Mask;
11107 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11108 if (MO->isImm())
11109 Mask = MO->getImm();
11110 else if (!getFoldableImm(MO, Mask))
11111 return false;
11112 Mask &= maxUIntN(SrcSize);
11113 return isPowerOf2_64(Mask);
11114 };
11115
11116 MachineOperand *SrcOp = &Def->getOperand(1);
11117 if (isMask(SrcOp))
11118 SrcOp = &Def->getOperand(2);
11119 else if (isMask(&Def->getOperand(2)))
11120 SrcOp = &Def->getOperand(1);
11121 else
11122 return false;
11123
11124 // A valid Mask is required to have a single bit set, hence a non-zero and
11125 // power-of-two value. This verifies that we will not do 64-bit shift below.
11126 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11127 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
11128 if (IsSigned && BitNo == SrcSize - 1)
11129 return false;
11130
11131 ExpectedValue <<= BitNo;
11132
11133 bool IsReversedCC = false;
11134 if (CmpValue != ExpectedValue) {
11135 if (!IsReversible)
11136 return false;
11137 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11138 if (!IsReversedCC)
11139 return false;
11140 }
11141
11142 Register DefReg = Def->getOperand(0).getReg();
11143 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
11144 return false;
11145
11146 if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false))
11147 return false;
11148
11149 if (!MRI->use_nodbg_empty(DefReg)) {
11150 assert(!IsReversedCC);
11151 return true;
11152 }
11153
11154 // Replace AND with unused result with a S_BITCMP.
11155 MachineBasicBlock *MBB = Def->getParent();
11156
11157 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11158 : AMDGPU::S_BITCMP1_B32
11159 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11160 : AMDGPU::S_BITCMP1_B64;
11161
11162 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
11163 .add(*SrcOp)
11164 .addImm(BitNo);
11165 Def->eraseFromParent();
11166
11167 return true;
11168 };
11169
11170 switch (CmpInstr.getOpcode()) {
11171 default:
11172 break;
11173 case AMDGPU::S_CMP_EQ_U32:
11174 case AMDGPU::S_CMP_EQ_I32:
11175 case AMDGPU::S_CMPK_EQ_U32:
11176 case AMDGPU::S_CMPK_EQ_I32:
11177 return optimizeCmpAnd(1, 32, true, false) ||
11178 optimizeCmpSelect(/*NeedInversion=*/true);
11179 case AMDGPU::S_CMP_GE_U32:
11180 case AMDGPU::S_CMPK_GE_U32:
11181 return optimizeCmpAnd(1, 32, false, false);
11182 case AMDGPU::S_CMP_GE_I32:
11183 case AMDGPU::S_CMPK_GE_I32:
11184 return optimizeCmpAnd(1, 32, false, true);
11185 case AMDGPU::S_CMP_EQ_U64:
11186 return optimizeCmpAnd(1, 64, true, false);
11187 case AMDGPU::S_CMP_LG_U32:
11188 case AMDGPU::S_CMP_LG_I32:
11189 case AMDGPU::S_CMPK_LG_U32:
11190 case AMDGPU::S_CMPK_LG_I32:
11191 return optimizeCmpAnd(0, 32, true, false) ||
11192 optimizeCmpSelect(/*NeedInversion=*/false);
11193 case AMDGPU::S_CMP_GT_U32:
11194 case AMDGPU::S_CMPK_GT_U32:
11195 return optimizeCmpAnd(0, 32, false, false);
11196 case AMDGPU::S_CMP_GT_I32:
11197 case AMDGPU::S_CMPK_GT_I32:
11198 return optimizeCmpAnd(0, 32, false, true);
11199 case AMDGPU::S_CMP_LG_U64:
11200 return optimizeCmpAnd(0, 64, true, false) ||
11201 optimizeCmpSelect(/*NeedInversion=*/false);
11202 }
11203
11204 return false;
11205}
11206
11207void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
11208 AMDGPU::OpName OpName) const {
11209 if (!ST.needsAlignedVGPRs())
11210 return;
11211
11212 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11213 if (OpNo < 0)
11214 return;
11215 MachineOperand &Op = MI.getOperand(OpNo);
11216 if (getOpSize(MI, OpNo) > 4)
11217 return;
11218
11219 // Add implicit aligned super-reg to force alignment on the data operand.
11220 const DebugLoc &DL = MI.getDebugLoc();
11221 MachineBasicBlock *BB = MI.getParent();
11222 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11223 Register DataReg = Op.getReg();
11224 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11225 Register Undef = MRI.createVirtualRegister(
11226 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11227 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11228 Register NewVR =
11229 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11230 : &AMDGPU::VReg_64_Align2RegClass);
11231 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11232 .addReg(DataReg, {}, Op.getSubReg())
11233 .addImm(AMDGPU::sub0)
11234 .addReg(Undef)
11235 .addImm(AMDGPU::sub1);
11236 Op.setReg(NewVR);
11237 Op.setSubReg(AMDGPU::sub0);
11238 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11239}
11240
11241bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
11242 if (isIGLP(*MI))
11243 return false;
11244
11245 return TargetInstrInfo::isGlobalMemoryObject(MI);
11246}
11247
11248bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
11249 if (!isWMMA(MI) && !isSWMMAC(MI))
11250 return false;
11251
11252 if (ST.hasGFX1250Insts())
11253 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11254
11255 return true;
11256}
11257
11258bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
11259 unsigned Opcode = MI.getOpcode();
11260
11261 if (AMDGPU::isGFX12Plus(ST))
11262 return isDOT(MI) || isXDLWMMA(MI);
11263
11264 if (!isMAI(MI) || isDGEMM(Opcode) ||
11265 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11266 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11267 return false;
11268
11269 if (!ST.hasGFX940Insts())
11270 return true;
11271
11272 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11273}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion, unsigned &NewDefOpc)
static bool isSCCDeadOnExit(MachineBasicBlock *MBB)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:144
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
MIRFormater - Interface to format MIR operand based on target.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx; it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
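A hedged sketch of the usual inspect-then-mutate pattern with these accessors (MO is an assumed MachineOperand and ImmVal an assumed constant; this does not reproduce a specific routine from this file):
  // Fold a known constant into an operand while keeping its slot in the MI.
  if (MO.isReg() && MO.getReg().isVirtual() && !MO.getSubReg()) {
    MO.ChangeToImmediate(ImmVal); // flips the kind from MO_Register to MO_Immediate
  } else if (MO.isImm() && MO.getImm() != ImmVal) {
    MO.setImm(ImmVal);            // adjust an existing immediate in place
  }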
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
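These queries pair up in the usual guarded pattern; a small sketch (Node is an assumed SDNode* and the operand index is illustrative):
  // Only the post-isel opcode and constant operands are queried here.
  if (Node->isMachineOpcode() &&
      Node->getMachineOpcode() == AMDGPU::S_LOAD_DWORD_IMM) {
    uint64_t Off = Node->getConstantOperandVal(1); // assumed offset position
    (void)Off;
  }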
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
bool canAddToBBProlog(const MachineInstr &MI) const
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool mayAccessScratch(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool setsSCCIfResultIsNonZero(const MachineInstr &MI)
const MIRFormatter * getMIRFormatter() const override
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const final
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIdx operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const override
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
static bool usesLGKM_CNT(const MachineInstr &MI)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst for 16-bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
const MachineOperand & getCalleeOperand(const MachineInstr &MI) const override
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (PostRASchedulerList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
static bool isWWMRegSpillOpcode(uint16_t Opcode)
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named OperandName.
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the OpIdx operand of MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
std::optional< int64_t > getImmOrMaterializedImm(MachineOperand &Op) const
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
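A sketch of how these classification helpers and getNamedOperand combine when scanning memory instructions (TII is an assumed SIInstrInfo* and MI a MachineInstr&; reading the offset this way is illustrative):
  // Classify first, then pull the named offset operand if the opcode has one.
  if (SIInstrInfo::isSMRD(MI) || SIInstrInfo::isFLAT(MI)) {
    if (const MachineOperand *Off =
            TII->getNamedOperand(MI, AMDGPU::OpName::offset)) {
      int64_t ByteOffset = Off->getImm();
      (void)ByteOffset;
    }
  }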
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual const MachineOperand & getCalleeOperand(const MachineInstr &MI) const
Returns the callee operand from the given MI.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Check whether this literal is inlinable and not one of the inline values reserved for floating-point constants.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:233
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:203
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:210
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:224
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:226
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:212
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:207
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:209
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_V2FP16_SPLAT
Definition SIDefines.h:211
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:220
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:218
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:213
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:206
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:227
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:239
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:214
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:250
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:223
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:225
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:244
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:215
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:240
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:222
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:204
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:230
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:587
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:589
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:586
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:588
@ TI_CONSTDATA_START
Definition AMDGPU.h:585
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
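A sketch of how the inline-constant predicates usually gate literal emission (Imm is an assumed 64-bit candidate value and ST an assumed GCNSubtarget; ST.hasInv2PiInlineImm() is the usual source of the HasInv2Pi flag):
  // Inline constants encode for free; anything else costs a literal dword.
  const bool HasInv2Pi = ST.hasInv2PiInlineImm();
  if (AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Imm), HasInv2Pi)) {
    // May be folded directly into the instruction encoding.
  } else if (AMDGPU::isValid32BitLiteral(Imm, /*IsFP64=*/false)) {
    // Needs the extra 32-bit literal attached to the instruction.
  }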
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition MathExtras.h:207
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
constexpr RegState getKillRegState(bool B)
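These flags feed the register-adding builder methods above; a one-line sketch (MIB, SrcReg, and IsKill are assumed to be in scope):
  // Conditionally mark SrcReg as killed when appending it as a use.
  MIB.addReg(SrcReg, getKillRegState(IsKill));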
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:546
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:202
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair, skipping copy-like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr RegState getUndefRegState(bool B)
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store a worklist of machine instructions.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands read the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.