1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
52// Must be at least 4 to be able to branch over minimum unconditional branch
53// code. This is only for making it possible to write reasonably small tests for
54// long branches.
55static cl::opt<unsigned>
56BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
59static cl::opt<bool> Fix16BitCopies(
60    "amdgpu-fix-16-bit-physreg-copies",
61    cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62    cl::init(true),
63    cl::ReallyHidden);
64
65SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
66    : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
67 AMDGPU::ADJCALLSTACKDOWN),
68 RI(ST), ST(ST) {
69 SchedModel.init(&ST);
70}
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
83/// Returns true if both nodes have the same value for the given
84/// operand \p Op, or if both nodes do not have this operand.
85static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
86                                      AMDGPU::OpName OpName) {
87 unsigned Opc0 = N0->getMachineOpcode();
88 unsigned Opc1 = N1->getMachineOpcode();
89
90 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
91 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
92
93 if (Op0Idx == -1 && Op1Idx == -1)
94 return true;
95
96
97 if ((Op0Idx == -1 && Op1Idx != -1) ||
98 (Op1Idx == -1 && Op0Idx != -1))
99 return false;
100
101 // getNamedOperandIdx returns the index for the MachineInstr's operands,
102 // which includes the result as the first operand. We are indexing into the
103 // MachineSDNode's operands, so we need to skip the result operand to get
104 // the real index.
105 --Op0Idx;
106 --Op1Idx;
107
108 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
109}
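// Illustrative note: comparing, say, the soffset operand of two loads looks up
// its MachineInstr operand index for each opcode; because MachineSDNodes do
// not carry the result operand, the index is decremented by one before the
// SDNode operands are compared. If neither opcode has the named operand the
// nodes are considered to agree, and if only one has it they do not.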
110
111static bool canRemat(const MachineInstr &MI) {
112
113  if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
114      SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
115      SIInstrInfo::isSALU(MI))
116    return true;
117
118 if (SIInstrInfo::isSMRD(MI)) {
119 return !MI.memoperands_empty() &&
120 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
121 return MMO->isLoad() && MMO->isInvariant();
122 });
123 }
124
125 return false;
126}
127
128bool SIInstrInfo::isReallyTriviallyReMaterializable(
129    const MachineInstr &MI) const {
130
131 if (canRemat(MI)) {
132 // Normally VALU use of exec would block the rematerialization, but that
133 // is OK in this case to have an implicit exec read as all VALU do.
134 // We really want all of the generic logic for this except for this.
135
136 // Another potential implicit use is mode register. The core logic of
137 // the RA will not attempt rematerialization if mode is set anywhere
138 // in the function, otherwise it is safe since mode is not changed.
139
140    // There is a difference from the generic method, which does not allow
141    // rematerialization if there are virtual register uses. We allow this,
142    // and therefore this method covers SOP instructions as well.
143 if (!MI.hasImplicitDef() &&
144 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
145 !MI.mayRaiseFPException())
146 return true;
147 }
148
149  return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
150}
151
152// Returns true if the scalar result of a VALU instruction depends on exec.
153bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
154 // Ignore comparisons which are only used masked with exec.
155 // This allows some hoisting/sinking of VALU comparisons.
156 if (MI.isCompare()) {
157 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
158 if (!Dst)
159 return true;
160
161 Register DstReg = Dst->getReg();
162 if (!DstReg.isVirtual())
163 return true;
164
165 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
166 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
167 switch (Use.getOpcode()) {
168 case AMDGPU::S_AND_SAVEEXEC_B32:
169 case AMDGPU::S_AND_SAVEEXEC_B64:
170 break;
171 case AMDGPU::S_AND_B32:
172 case AMDGPU::S_AND_B64:
173 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
174 return true;
175 break;
176 default:
177 return true;
178 }
179 }
180 return false;
181 }
182
183 switch (MI.getOpcode()) {
184 default:
185 break;
186 case AMDGPU::V_READFIRSTLANE_B32:
187 return true;
188 }
189
190 return false;
191}
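// Illustrative consequence: a V_CMP whose virtual SGPR result is consumed only
// by S_AND_SAVEEXEC, or by S_AND_B32/B64 instructions that also read exec, is
// reported as not depending on exec, so such comparisons may be hoisted or
// sunk; any other user, a physical destination, or a missing sdst keeps the
// conservative answer that the result depends on exec.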
192
193bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
194  // Any implicit use of exec by VALU is not a real register read.
195 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
196 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
197}
198
199bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
200                               MachineBasicBlock *SuccToSinkTo,
201 MachineCycleInfo *CI) const {
202 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
203 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
204 return true;
205
206 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
207 // Check if sinking of MI would create temporal divergent use.
208 for (auto Op : MI.uses()) {
209 if (Op.isReg() && Op.getReg().isVirtual() &&
210 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
211 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
212
213 // SgprDef defined inside cycle
214 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
215 if (FromCycle == nullptr)
216 continue;
217
218 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
219 // Check if there is a FromCycle that contains SgprDef's basic block but
220 // does not contain SuccToSinkTo and also has divergent exit condition.
221 while (FromCycle && !FromCycle->contains(ToCycle)) {
222        SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
223        FromCycle->getExitingBlocks(ExitingBlocks);
224
225 // FromCycle has divergent exit condition.
226 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
227 if (hasDivergentBranch(ExitingBlock))
228 return false;
229 }
230
231 FromCycle = FromCycle->getParentCycle();
232 }
233 }
234 }
235
236 return true;
237}
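// Illustrative case: if MI uses an SGPR lane mask defined inside a cycle whose
// exiting blocks branch divergently, and the sink target lies outside that
// cycle, the sunk use would observe the value only after reconvergence (a
// temporally divergent use), so the sink is rejected; SI_IF_BREAK, which only
// edits a lane mask, is always allowed to sink.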
238
239bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
240                                          int64_t &Offset0,
241 int64_t &Offset1) const {
242 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
243 return false;
244
245 unsigned Opc0 = Load0->getMachineOpcode();
246 unsigned Opc1 = Load1->getMachineOpcode();
247
248 // Make sure both are actually loads.
249 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
250 return false;
251
252 // A mayLoad instruction without a def is not a load. Likely a prefetch.
253 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
254 return false;
255
256 if (isDS(Opc0) && isDS(Opc1)) {
257
258 // FIXME: Handle this case:
259 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
260 return false;
261
262 // Check base reg.
263 if (Load0->getOperand(0) != Load1->getOperand(0))
264 return false;
265
266 // Skip read2 / write2 variants for simplicity.
267    // TODO: We should report true if the used offsets are adjacent (excluding
268    // the st64 versions).
269 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
270 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
271 if (Offset0Idx == -1 || Offset1Idx == -1)
272 return false;
273
274 // XXX - be careful of dataless loads
275 // getNamedOperandIdx returns the index for MachineInstrs. Since they
276 // include the output in the operand list, but SDNodes don't, we need to
277 // subtract the index by one.
278 Offset0Idx -= get(Opc0).NumDefs;
279 Offset1Idx -= get(Opc1).NumDefs;
280 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
281 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
282 return true;
283 }
284
285 if (isSMRD(Opc0) && isSMRD(Opc1)) {
286 // Skip time and cache invalidation instructions.
287 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
288 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
289 return false;
290
291 unsigned NumOps = getNumOperandsNoGlue(Load0);
292 if (NumOps != getNumOperandsNoGlue(Load1))
293 return false;
294
295 // Check base reg.
296 if (Load0->getOperand(0) != Load1->getOperand(0))
297 return false;
298
299 // Match register offsets, if both register and immediate offsets present.
300 assert(NumOps == 4 || NumOps == 5);
301 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
302 return false;
303
304    const ConstantSDNode *Load0Offset =
305        dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
306    const ConstantSDNode *Load1Offset =
307        dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
308
309 if (!Load0Offset || !Load1Offset)
310 return false;
311
312 Offset0 = Load0Offset->getZExtValue();
313 Offset1 = Load1Offset->getZExtValue();
314 return true;
315 }
316
317 // MUBUF and MTBUF can access the same addresses.
318 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
319
320 // MUBUF and MTBUF have vaddr at different indices.
321 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
322 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
323 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
324 return false;
325
326 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
327 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
328
329 if (OffIdx0 == -1 || OffIdx1 == -1)
330 return false;
331
332 // getNamedOperandIdx returns the index for MachineInstrs. Since they
333 // include the output in the operand list, but SDNodes don't, we need to
334 // subtract the index by one.
335 OffIdx0 -= get(Opc0).NumDefs;
336 OffIdx1 -= get(Opc1).NumDefs;
337
338 SDValue Off0 = Load0->getOperand(OffIdx0);
339 SDValue Off1 = Load1->getOperand(OffIdx1);
340
341 // The offset might be a FrameIndexSDNode.
342 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
343 return false;
344
345 Offset0 = Off0->getAsZExtVal();
346 Offset1 = Off1->getAsZExtVal();
347 return true;
348 }
349
350 return false;
351}
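// Worked example: two DS_READ_B32 nodes that share the same address operand
// and carry immediate offsets 0 and 4 return true with Offset0 = 0 and
// Offset1 = 4. read2/write2 forms, mismatched operand counts, and offsets that
// are not constants are conservatively rejected.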
352
353static bool isStride64(unsigned Opc) {
354 switch (Opc) {
355 case AMDGPU::DS_READ2ST64_B32:
356 case AMDGPU::DS_READ2ST64_B64:
357 case AMDGPU::DS_WRITE2ST64_B32:
358 case AMDGPU::DS_WRITE2ST64_B64:
359 return true;
360 default:
361 return false;
362 }
363}
364
365bool SIInstrInfo::getMemOperandsWithOffsetWidth(
366    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
367    int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
368 const TargetRegisterInfo *TRI) const {
369 if (!LdSt.mayLoadOrStore())
370 return false;
371
372 unsigned Opc = LdSt.getOpcode();
373 OffsetIsScalable = false;
374 const MachineOperand *BaseOp, *OffsetOp;
375 int DataOpIdx;
376
377 if (isDS(LdSt)) {
378 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
379 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
380 if (OffsetOp) {
381 // Normal, single offset LDS instruction.
382 if (!BaseOp) {
383 // DS_CONSUME/DS_APPEND use M0 for the base address.
384 // TODO: find the implicit use operand for M0 and use that as BaseOp?
385 return false;
386 }
387 BaseOps.push_back(BaseOp);
388 Offset = OffsetOp->getImm();
389 // Get appropriate operand, and compute width accordingly.
390 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
391 if (DataOpIdx == -1)
392 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
393 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
394 Width = LocationSize::precise(64);
395 else
396 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
397 } else {
398 // The 2 offset instructions use offset0 and offset1 instead. We can treat
399 // these as a load with a single offset if the 2 offsets are consecutive.
400 // We will use this for some partially aligned loads.
401 const MachineOperand *Offset0Op =
402 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
403 const MachineOperand *Offset1Op =
404 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
405
406 unsigned Offset0 = Offset0Op->getImm() & 0xff;
407 unsigned Offset1 = Offset1Op->getImm() & 0xff;
408 if (Offset0 + 1 != Offset1)
409 return false;
410
411 // Each of these offsets is in element sized units, so we need to convert
412 // to bytes of the individual reads.
413
414 unsigned EltSize;
415 if (LdSt.mayLoad())
416 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
417 else {
418 assert(LdSt.mayStore());
419 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
421 }
422
423 if (isStride64(Opc))
424 EltSize *= 64;
425
426 BaseOps.push_back(BaseOp);
427 Offset = EltSize * Offset0;
428 // Get appropriate operand(s), and compute width accordingly.
429 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
430 if (DataOpIdx == -1) {
431 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
432 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
433 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
434 Width = LocationSize::precise(
435 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
436 } else {
437 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
438 }
439 }
440 return true;
441 }
442
443 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
444 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
445 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
446 return false;
447 BaseOps.push_back(RSrc);
448 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
449 if (BaseOp && !BaseOp->isFI())
450 BaseOps.push_back(BaseOp);
451 const MachineOperand *OffsetImm =
452 getNamedOperand(LdSt, AMDGPU::OpName::offset);
453 Offset = OffsetImm->getImm();
454 const MachineOperand *SOffset =
455 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
456 if (SOffset) {
457 if (SOffset->isReg())
458 BaseOps.push_back(SOffset);
459 else
460 Offset += SOffset->getImm();
461 }
462 // Get appropriate operand, and compute width accordingly.
463 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
464 if (DataOpIdx == -1)
465 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
466 if (DataOpIdx == -1) // LDS DMA
467 return false;
468 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
469 return true;
470 }
471
472 if (isImage(LdSt)) {
473 auto RsrcOpName =
474 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
475 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
476 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
477 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
478 if (VAddr0Idx >= 0) {
479 // GFX10 possible NSA encoding.
480 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
481 BaseOps.push_back(&LdSt.getOperand(I));
482 } else {
483 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
484 }
485 Offset = 0;
486 // Get appropriate operand, and compute width accordingly.
487 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
488 if (DataOpIdx == -1)
489 return false; // no return sampler
490 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
491 return true;
492 }
493
494 if (isSMRD(LdSt)) {
495 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
496 if (!BaseOp) // e.g. S_MEMTIME
497 return false;
498 BaseOps.push_back(BaseOp);
499 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
500 Offset = OffsetOp ? OffsetOp->getImm() : 0;
501 // Get appropriate operand, and compute width accordingly.
502 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
503 if (DataOpIdx == -1)
504 return false;
505 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
506 return true;
507 }
508
509 if (isFLAT(LdSt)) {
510 // Instructions have either vaddr or saddr or both or none.
511 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
512 if (BaseOp)
513 BaseOps.push_back(BaseOp);
514 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
515 if (BaseOp)
516 BaseOps.push_back(BaseOp);
517 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
518 // Get appropriate operand, and compute width accordingly.
519 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
520 if (DataOpIdx == -1)
521 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
522 if (DataOpIdx == -1) // LDS DMA
523 return false;
524 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
525 return true;
526 }
527
528 return false;
529}
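// Worked example for the two-offset DS path: a DS_READ2_B32 with offset0 = 2
// and offset1 = 3 has a 64-bit destination, so EltSize = 64 / 16 = 4 and the
// pair is reported as one access at Offset = 8 with an 8-byte width; the
// DS_READ2ST64_B32 variant scales EltSize by 64 first.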
530
531static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
532                                  ArrayRef<const MachineOperand *> BaseOps1,
533                                  const MachineInstr &MI2,
534                                  ArrayRef<const MachineOperand *> BaseOps2) {
535 // Only examine the first "base" operand of each instruction, on the
536 // assumption that it represents the real base address of the memory access.
537 // Other operands are typically offsets or indices from this base address.
538 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
539 return true;
540
541 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
542 return false;
543
544 auto *MO1 = *MI1.memoperands_begin();
545 auto *MO2 = *MI2.memoperands_begin();
546 if (MO1->getAddrSpace() != MO2->getAddrSpace())
547 return false;
548
549 const auto *Base1 = MO1->getValue();
550 const auto *Base2 = MO2->getValue();
551 if (!Base1 || !Base2)
552 return false;
553 Base1 = getUnderlyingObject(Base1);
554 Base2 = getUnderlyingObject(Base2);
555
556 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
557 return false;
558
559 return Base1 == Base2;
560}
561
562bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
563                                      int64_t Offset1, bool OffsetIsScalable1,
564                                      ArrayRef<const MachineOperand *> BaseOps2,
565                                      int64_t Offset2, bool OffsetIsScalable2,
566 unsigned ClusterSize,
567 unsigned NumBytes) const {
568 // If the mem ops (to be clustered) do not have the same base ptr, then they
569 // should not be clustered
570 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
571 if (!BaseOps1.empty() && !BaseOps2.empty()) {
572 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
573 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
574 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
575 return false;
576
577 const SIMachineFunctionInfo *MFI =
578 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
579 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
580 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
581 // If only one base op is empty, they do not have the same base ptr
582 return false;
583 }
584
585  // In order to avoid register pressure, on average, the number of DWORDs
586 // loaded together by all clustered mem ops should not exceed
587 // MaxMemoryClusterDWords. This is an empirical value based on certain
588 // observations and performance related experiments.
589 // The good thing about this heuristic is - it avoids clustering of too many
590 // sub-word loads, and also avoids clustering of wide loads. Below is the
591 // brief summary of how the heuristic behaves for various `LoadSize` when
592 // MaxMemoryClusterDWords is 8.
593 //
594 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
595 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
596 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
597 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
598 // (5) LoadSize >= 17: do not cluster
599 const unsigned LoadSize = NumBytes / ClusterSize;
600 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
601 return NumDWords <= MaxMemoryClusterDWords;
602}
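// Worked example: with the default limit of 8 DWORDs, clustering four 8-byte
// loads gives LoadSize = NumBytes / ClusterSize = 8, i.e. 2 DWORDs per op and
// NumDWords = 8, so the cluster is allowed; a fifth such load would raise
// NumDWords to 10 and stop clustering.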
603
604// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
605// the first 16 loads will be interleaved with the stores, and the next 16 will
606// be clustered as expected. It should really split into two batches of 16.
607//
608// Loads are clustered until this returns false, rather than trying to schedule
609// groups of stores. This also means we have to deal with saying different
610// address space loads should be clustered, and ones which might cause bank
611// conflicts.
612//
613// This might be deprecated so it might not be worth that much effort to fix.
614bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
615                                          int64_t Offset0, int64_t Offset1,
616 unsigned NumLoads) const {
617 assert(Offset1 > Offset0 &&
618 "Second offset should be larger than first offset!");
619 // If we have less than 16 loads in a row, and the offsets are within 64
620 // bytes, then schedule together.
621
622 // A cacheline is 64 bytes (for global memory).
623 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
624}
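// Example: loads at offsets 0 and 60 fall within one 64-byte cacheline and may
// be scheduled together, while loads at offsets 0 and 64, or a run of more
// than 16 loads, will not be.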
625
626static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
627                              MachineBasicBlock::iterator MI,
628                              const DebugLoc &DL, MCRegister DestReg,
629 MCRegister SrcReg, bool KillSrc,
630 const char *Msg = "illegal VGPR to SGPR copy") {
631 MachineFunction *MF = MBB.getParent();
632
633  LLVMContext &C = MF->getFunction().getContext();
634  C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
635
636 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
637 .addReg(SrcReg, getKillRegState(KillSrc));
638}
639
640/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
641/// possible to have a direct copy in these cases on GFX908, so an intermediate
642/// VGPR copy is required.
643static void indirectCopyToAGPR(const SIInstrInfo &TII,
644                               MachineBasicBlock &MBB,
645                               MachineBasicBlock::iterator MI,
646                               const DebugLoc &DL, MCRegister DestReg,
647 MCRegister SrcReg, bool KillSrc,
648 RegScavenger &RS, bool RegsOverlap,
649 Register ImpDefSuperReg = Register(),
650 Register ImpUseSuperReg = Register()) {
651 assert((TII.getSubtarget().hasMAIInsts() &&
652 !TII.getSubtarget().hasGFX90AInsts()) &&
653 "Expected GFX908 subtarget.");
654
655 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
656 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
657 "Source register of the copy should be either an SGPR or an AGPR.");
658
659 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
660 "Destination register of the copy should be an AGPR.");
661
662 const SIRegisterInfo &RI = TII.getRegisterInfo();
663
664 // First try to find defining accvgpr_write to avoid temporary registers.
665 // In the case of copies of overlapping AGPRs, we conservatively do not
666 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
667 // an accvgpr_write used for this same copy due to implicit-defs
668 if (!RegsOverlap) {
669 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
670 --Def;
671
672 if (!Def->modifiesRegister(SrcReg, &RI))
673 continue;
674
675 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
676 Def->getOperand(0).getReg() != SrcReg)
677 break;
678
679 MachineOperand &DefOp = Def->getOperand(1);
680 assert(DefOp.isReg() || DefOp.isImm());
681
682 if (DefOp.isReg()) {
683 bool SafeToPropagate = true;
684 // Check that register source operand is not clobbered before MI.
685 // Immediate operands are always safe to propagate.
686 for (auto I = Def; I != MI && SafeToPropagate; ++I)
687 if (I->modifiesRegister(DefOp.getReg(), &RI))
688 SafeToPropagate = false;
689
690 if (!SafeToPropagate)
691 break;
692
693 for (auto I = Def; I != MI; ++I)
694 I->clearRegisterKills(DefOp.getReg(), &RI);
695 }
696
697 MachineInstrBuilder Builder =
698 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
699 .add(DefOp);
700 if (ImpDefSuperReg)
701 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
702
703 if (ImpUseSuperReg) {
704        Builder.addReg(ImpUseSuperReg,
705                       getKillRegState(KillSrc) | RegState::Implicit);
706 }
707
708 return;
709 }
710 }
711
712 RS.enterBasicBlockEnd(MBB);
713 RS.backward(std::next(MI));
714
715 // Ideally we want to have three registers for a long reg_sequence copy
716 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
717 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
718 *MBB.getParent());
719
720 // Registers in the sequence are allocated contiguously so we can just
721 // use register number to pick one of three round-robin temps.
722 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
723 Register Tmp =
724 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
725 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
726 "VGPR used for an intermediate copy should have been reserved.");
727
728 // Only loop through if there are any free registers left. We don't want to
729 // spill.
730 while (RegNo--) {
731 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
732 /* RestoreAfter */ false, 0,
733 /* AllowSpill */ false);
734 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
735 break;
736 Tmp = Tmp2;
737 RS.setRegUsed(Tmp);
738 }
739
740 // Insert copy to temporary VGPR.
741 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
742 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
743 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
744 } else {
745 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
746 }
747
748 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
749 .addReg(SrcReg, getKillRegState(KillSrc));
750 if (ImpUseSuperReg) {
751    UseBuilder.addReg(ImpUseSuperReg,
752                      getKillRegState(KillSrc) | RegState::Implicit);
753 }
754
755 MachineInstrBuilder DefBuilder
756 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
757 .addReg(Tmp, RegState::Kill);
758
759 if (ImpDefSuperReg)
760 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
761}
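// Illustrative note: the temporary VGPR defaults to the function's reserved
// AGPR-copy VGPR, and the (DestReg - AGPR0) % 3 round-robin only changes it
// when the scavenger finds a free VGPR below the pressure limit; e.g. a copy
// to AGPR2 makes up to two scavenging attempts, while a copy to AGPR0 always
// uses the reserved VGPR.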
762
763static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
764                           MachineBasicBlock::iterator I, const DebugLoc &DL,
765                           MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
766 const TargetRegisterClass *RC, bool Forward) {
767 const SIRegisterInfo &RI = TII.getRegisterInfo();
768 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
770 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
771
772 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
773 int16_t SubIdx = BaseIndices[Idx];
774 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
775 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
776 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
777 unsigned Opcode = AMDGPU::S_MOV_B32;
778
779 // Is SGPR aligned? If so try to combine with next.
780 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
781 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
782 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
783 // Can use SGPR64 copy
784 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
785 SubIdx = RI.getSubRegFromChannel(Channel, 2);
786 DestSubReg = RI.getSubReg(DestReg, SubIdx);
787 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
788 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
789 Opcode = AMDGPU::S_MOV_B64;
790 Idx++;
791 }
792
793 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
794 .addReg(SrcSubReg)
795 .addReg(SrcReg, RegState::Implicit);
796
797 if (!FirstMI)
798 FirstMI = LastMI;
799
800 if (!Forward)
801 I--;
802 }
803
804 assert(FirstMI && LastMI);
805 if (!Forward)
806 std::swap(FirstMI, LastMI);
807
808 FirstMI->addOperand(
809 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
810
811 if (KillSrc)
812 LastMI->addRegisterKilled(SrcReg, &RI);
813}
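// Worked example: copying s[4:7] to s[8:11] emits two S_MOV_B64 instructions,
// because both the source and destination subregisters are even-aligned at
// each step; an odd-aligned pair falls back to per-32-bit S_MOV_B32 copies.
// The first emitted move implicitly defines the whole destination tuple and,
// when requested, the last one kills the whole source tuple.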
814
815void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
816                              MachineBasicBlock::iterator MI,
817                              const DebugLoc &DL, Register DestReg,
818 Register SrcReg, bool KillSrc, bool RenamableDest,
819 bool RenamableSrc) const {
820 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
821 unsigned Size = RI.getRegSizeInBits(*RC);
822 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
823 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
824
825 // The rest of copyPhysReg assumes Src and Dst size are the same size.
826 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
827 // we remove Fix16BitCopies and this code block?
828 if (Fix16BitCopies) {
829 if (((Size == 16) != (SrcSize == 16))) {
830 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
831 assert(ST.useRealTrue16Insts());
832 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
833 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
834 RegToFix = SubReg;
835
836 if (DestReg == SrcReg) {
837 // Identity copy. Insert empty bundle since ExpandPostRA expects an
838 // instruction here.
839 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
840 return;
841 }
842 RC = RI.getPhysRegBaseClass(DestReg);
843 Size = RI.getRegSizeInBits(*RC);
844 SrcRC = RI.getPhysRegBaseClass(SrcReg);
845 SrcSize = RI.getRegSizeInBits(*SrcRC);
846 }
847 }
848
849 if (RC == &AMDGPU::VGPR_32RegClass) {
850 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
851 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
852 AMDGPU::AGPR_32RegClass.contains(SrcReg));
853 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
854 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
855 BuildMI(MBB, MI, DL, get(Opc), DestReg)
856 .addReg(SrcReg, getKillRegState(KillSrc));
857 return;
858 }
859
860 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
861 RC == &AMDGPU::SReg_32RegClass) {
862 if (SrcReg == AMDGPU::SCC) {
863 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
864 .addImm(1)
865 .addImm(0);
866 return;
867 }
868
869 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
870 if (DestReg == AMDGPU::VCC_LO) {
871 // FIXME: Hack until VReg_1 removed.
872 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
873 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
874 .addImm(0)
875 .addReg(SrcReg, getKillRegState(KillSrc));
876 return;
877 }
878
879 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
880 return;
881 }
882
883 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 return;
886 }
887
888 if (RC == &AMDGPU::SReg_64RegClass) {
889 if (SrcReg == AMDGPU::SCC) {
890 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
891 .addImm(1)
892 .addImm(0);
893 return;
894 }
895
896 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
897 if (DestReg == AMDGPU::VCC) {
898 // FIXME: Hack until VReg_1 removed.
899 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
900 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
901 .addImm(0)
902 .addReg(SrcReg, getKillRegState(KillSrc));
903 return;
904 }
905
906 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
907 return;
908 }
909
910 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
911 .addReg(SrcReg, getKillRegState(KillSrc));
912 return;
913 }
914
915 if (DestReg == AMDGPU::SCC) {
916 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
917 // but SelectionDAG emits such copies for i1 sources.
918 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
919 // This copy can only be produced by patterns
920 // with explicit SCC, which are known to be enabled
921 // only for subtargets with S_CMP_LG_U64 present.
922 assert(ST.hasScalarCompareEq64());
923 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
924 .addReg(SrcReg, getKillRegState(KillSrc))
925 .addImm(0);
926 } else {
927 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
928 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
929 .addReg(SrcReg, getKillRegState(KillSrc))
930 .addImm(0);
931 }
932
933 return;
934 }
935
936 if (RC == &AMDGPU::AGPR_32RegClass) {
937 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
938 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
939 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
940 .addReg(SrcReg, getKillRegState(KillSrc));
941 return;
942 }
943
944 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
945 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
946 .addReg(SrcReg, getKillRegState(KillSrc));
947 return;
948 }
949
950 // FIXME: Pass should maintain scavenger to avoid scan through the block on
951 // every AGPR spill.
952 RegScavenger RS;
953 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
954 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
955 return;
956 }
957
958 if (Size == 16) {
959 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
960 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
961 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
962
963 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
964 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
965 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
966 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
967 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
968 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
969 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
970 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
971
972 if (IsSGPRDst) {
973 if (!IsSGPRSrc) {
974 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
975 return;
976 }
977
978 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
979 .addReg(NewSrcReg, getKillRegState(KillSrc));
980 return;
981 }
982
983 if (IsAGPRDst || IsAGPRSrc) {
984 if (!DstLow || !SrcLow) {
985 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
986 "Cannot use hi16 subreg with an AGPR!");
987 }
988
989 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
990 return;
991 }
992
993 if (ST.useRealTrue16Insts()) {
994 if (IsSGPRSrc) {
995 assert(SrcLow);
996 SrcReg = NewSrcReg;
997 }
998 // Use the smaller instruction encoding if possible.
999 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1000 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1001 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1002 .addReg(SrcReg);
1003 } else {
1004 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1005 .addImm(0) // src0_modifiers
1006 .addReg(SrcReg)
1007 .addImm(0); // op_sel
1008 }
1009 return;
1010 }
1011
1012 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1013 if (!DstLow || !SrcLow) {
1014 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1015 "Cannot use hi16 subreg on VI!");
1016 }
1017
1018 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1019 .addReg(NewSrcReg, getKillRegState(KillSrc));
1020 return;
1021 }
1022
1023 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1024 .addImm(0) // src0_modifiers
1025 .addReg(NewSrcReg)
1026 .addImm(0) // clamp
1033 // First implicit operand is $exec.
1034 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1035 return;
1036 }
1037
1038 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1039 if (ST.hasMovB64()) {
1040 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1041 .addReg(SrcReg, getKillRegState(KillSrc));
1042 return;
1043 }
1044 if (ST.hasPkMovB32()) {
1045 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1047 .addReg(SrcReg)
1049 .addReg(SrcReg)
1050 .addImm(0) // op_sel_lo
1051 .addImm(0) // op_sel_hi
1052 .addImm(0) // neg_lo
1053 .addImm(0) // neg_hi
1054 .addImm(0) // clamp
1055 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1056 return;
1057 }
1058 }
1059
1060 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1061 if (RI.isSGPRClass(RC)) {
1062 if (!RI.isSGPRClass(SrcRC)) {
1063 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1064 return;
1065 }
1066 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1067 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1068 Forward);
1069 return;
1070 }
1071
1072 unsigned EltSize = 4;
1073 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1074 if (RI.isAGPRClass(RC)) {
1075 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1076 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1077 else if (RI.hasVGPRs(SrcRC) ||
1078 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1079 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1080 else
1081 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1082 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1083 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1084 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1085 (RI.isProperlyAlignedRC(*RC) &&
1086 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1087 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1088 if (ST.hasMovB64()) {
1089 Opcode = AMDGPU::V_MOV_B64_e32;
1090 EltSize = 8;
1091 } else if (ST.hasPkMovB32()) {
1092 Opcode = AMDGPU::V_PK_MOV_B32;
1093 EltSize = 8;
1094 }
1095 }
1096
1097 // For the cases where we need an intermediate instruction/temporary register
1098 // (destination is an AGPR), we need a scavenger.
1099 //
1100 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1101 // whole block for every handled copy.
1102 std::unique_ptr<RegScavenger> RS;
1103 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1104 RS = std::make_unique<RegScavenger>();
1105
1106 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1107
1108 // If there is an overlap, we can't kill the super-register on the last
1109 // instruction, since it will also kill the components made live by this def.
1110 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1111 const bool CanKillSuperReg = KillSrc && !Overlap;
1112
1113 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1114 unsigned SubIdx;
1115 if (Forward)
1116 SubIdx = SubIndices[Idx];
1117 else
1118 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1119 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1120 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1121 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1122
1123 bool IsFirstSubreg = Idx == 0;
1124 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1125
1126 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1127 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1128 Register ImpUseSuper = SrcReg;
1129 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1130 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1131 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1132      MachineInstrBuilder MIB =
1133          BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1135 .addReg(SrcSubReg)
1137 .addReg(SrcSubReg)
1138 .addImm(0) // op_sel_lo
1139 .addImm(0) // op_sel_hi
1140 .addImm(0) // neg_lo
1141 .addImm(0) // neg_hi
1142 .addImm(0) // clamp
1143 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1144      if (IsFirstSubreg)
1145        MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1146 } else {
1147 MachineInstrBuilder Builder =
1148 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1149 if (IsFirstSubreg)
1150 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1151
1152 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1153 }
1154 }
1155}
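// Illustrative summary of the vector path above: a 64-bit VGPR-to-VGPR copy
// becomes a single V_MOV_B64_e32 when the subtarget has it, a V_PK_MOV_B32
// when packed moves are available, and otherwise a pair of V_MOV_B32_e32
// copies of the 32-bit subregisters, with the first subcopy implicitly
// defining the full destination register.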
1156
1157int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1158 int NewOpc;
1159
1160 // Try to map original to commuted opcode
1161 NewOpc = AMDGPU::getCommuteRev(Opcode);
1162 if (NewOpc != -1)
1163 // Check if the commuted (REV) opcode exists on the target.
1164 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1165
1166 // Try to map commuted to original opcode
1167 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1168 if (NewOpc != -1)
1169 // Check if the original (non-REV) opcode exists on the target.
1170 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1171
1172 return Opcode;
1173}
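// Example: commuting an opcode that has a "_REV" mapping (via getCommuteRev)
// returns that variant only if it actually encodes on the current subtarget
// (pseudoToMCOpcode != -1); otherwise -1 is returned. Opcodes with no mapping
// in either direction commute to themselves.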
1174
1175const TargetRegisterClass *
1176SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1177  return &AMDGPU::VGPR_32RegClass;
1178}
1179
1180void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1181                                     MachineBasicBlock::iterator I,
1182                                     const DebugLoc &DL, Register DstReg,
1183                                     ArrayRef<MachineOperand> Cond,
1184                                     Register TrueReg,
1185 Register FalseReg) const {
1186 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1187  const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1188  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
1189 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1190 "Not a VGPR32 reg");
1191
1192 if (Cond.size() == 1) {
1193 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1194 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1195 .add(Cond[0]);
1196 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1197 .addImm(0)
1198 .addReg(FalseReg)
1199 .addImm(0)
1200 .addReg(TrueReg)
1201 .addReg(SReg);
1202 } else if (Cond.size() == 2) {
1203 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1204 switch (Cond[0].getImm()) {
1205 case SIInstrInfo::SCC_TRUE: {
1206 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1207 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1208 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1209 .addImm(0)
1210 .addReg(FalseReg)
1211 .addImm(0)
1212 .addReg(TrueReg)
1213 .addReg(SReg);
1214 break;
1215 }
1216 case SIInstrInfo::SCC_FALSE: {
1217 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1218 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1219 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1220 .addImm(0)
1221 .addReg(FalseReg)
1222 .addImm(0)
1223 .addReg(TrueReg)
1224 .addReg(SReg);
1225 break;
1226 }
1227 case SIInstrInfo::VCCNZ: {
1228 MachineOperand RegOp = Cond[1];
1229 RegOp.setImplicit(false);
1230 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1231 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1232 .add(RegOp);
1233 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1234 .addImm(0)
1235 .addReg(FalseReg)
1236 .addImm(0)
1237 .addReg(TrueReg)
1238 .addReg(SReg);
1239 break;
1240 }
1241 case SIInstrInfo::VCCZ: {
1242 MachineOperand RegOp = Cond[1];
1243 RegOp.setImplicit(false);
1244 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1245 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1246 .add(RegOp);
1247 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1248 .addImm(0)
1249 .addReg(TrueReg)
1250 .addImm(0)
1251 .addReg(FalseReg)
1252 .addReg(SReg);
1253 break;
1254 }
1255 case SIInstrInfo::EXECNZ: {
1256 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1257 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1258 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1259 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1260 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1261 .addImm(0)
1262 .addReg(FalseReg)
1263 .addImm(0)
1264 .addReg(TrueReg)
1265 .addReg(SReg);
1266 break;
1267 }
1268 case SIInstrInfo::EXECZ: {
1269 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1270 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1271 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1272 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1273 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1274 .addImm(0)
1275 .addReg(FalseReg)
1276 .addImm(0)
1277 .addReg(TrueReg)
1278 .addReg(SReg);
1279 llvm_unreachable("Unhandled branch predicate EXECZ");
1280 break;
1281 }
1282 default:
1283 llvm_unreachable("invalid branch predicate");
1284 }
1285 } else {
1286 llvm_unreachable("Can only handle Cond size 1 or 2");
1287 }
1288}
1289
1290Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1291                               MachineBasicBlock::iterator I,
1292                               const DebugLoc &DL,
1293 Register SrcReg, int Value) const {
1294 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1295 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1296 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1297 .addImm(Value)
1298 .addReg(SrcReg);
1299
1300 return Reg;
1301}
1302
1303Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1304                               MachineBasicBlock::iterator I,
1305                               const DebugLoc &DL,
1306 Register SrcReg, int Value) const {
1307 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1308 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1309 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1310 .addImm(Value)
1311 .addReg(SrcReg);
1312
1313 return Reg;
1314}
1315
1316bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1317                                          const Register Reg,
1318 int64_t &ImmVal) const {
1319 switch (MI.getOpcode()) {
1320 case AMDGPU::V_MOV_B32_e32:
1321 case AMDGPU::S_MOV_B32:
1322 case AMDGPU::S_MOVK_I32:
1323 case AMDGPU::S_MOV_B64:
1324 case AMDGPU::V_MOV_B64_e32:
1325 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1326 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1327 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1328 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1329 case AMDGPU::V_MOV_B64_PSEUDO:
1330 case AMDGPU::V_MOV_B16_t16_e32: {
1331 const MachineOperand &Src0 = MI.getOperand(1);
1332 if (Src0.isImm()) {
1333 ImmVal = Src0.getImm();
1334 return MI.getOperand(0).getReg() == Reg;
1335 }
1336
1337 return false;
1338 }
1339 case AMDGPU::V_MOV_B16_t16_e64: {
1340 const MachineOperand &Src0 = MI.getOperand(2);
1341 if (Src0.isImm() && !MI.getOperand(1).getImm()) {
1342 ImmVal = Src0.getImm();
1343 return MI.getOperand(0).getReg() == Reg;
1344 }
1345
1346 return false;
1347 }
1348 case AMDGPU::S_BREV_B32:
1349 case AMDGPU::V_BFREV_B32_e32:
1350 case AMDGPU::V_BFREV_B32_e64: {
1351 const MachineOperand &Src0 = MI.getOperand(1);
1352 if (Src0.isImm()) {
1353 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1354 return MI.getOperand(0).getReg() == Reg;
1355 }
1356
1357 return false;
1358 }
1359 case AMDGPU::S_NOT_B32:
1360 case AMDGPU::V_NOT_B32_e32:
1361 case AMDGPU::V_NOT_B32_e64: {
1362 const MachineOperand &Src0 = MI.getOperand(1);
1363 if (Src0.isImm()) {
1364 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1365 return MI.getOperand(0).getReg() == Reg;
1366 }
1367
1368 return false;
1369 }
1370 default:
1371 return false;
1372 }
1373}
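// Worked example: an S_BREV_B32 of immediate 1 defines the bit-reversed value
// 0x80000000 (sign-extended into ImmVal), and an S_NOT_B32 of immediate 0
// defines ~0; in both cases true is returned only when the defined register
// matches the queried Reg.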
1374
1375unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1376
1377 if (RI.isAGPRClass(DstRC))
1378 return AMDGPU::COPY;
1379 if (RI.getRegSizeInBits(*DstRC) == 16) {
1380 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1381 // before RA.
1382 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1383 }
1384 if (RI.getRegSizeInBits(*DstRC) == 32)
1385 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1386 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1387 return AMDGPU::S_MOV_B64;
1388 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1389 return AMDGPU::V_MOV_B64_PSEUDO;
1390 return AMDGPU::COPY;
1391}
1392
1393const MCInstrDesc &
1394SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1395                                     bool IsIndirectSrc) const {
1396 if (IsIndirectSrc) {
1397 if (VecSize <= 32) // 4 bytes
1398 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1399 if (VecSize <= 64) // 8 bytes
1400 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1401 if (VecSize <= 96) // 12 bytes
1402 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1403 if (VecSize <= 128) // 16 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1405 if (VecSize <= 160) // 20 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1407 if (VecSize <= 192) // 24 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
1409 if (VecSize <= 224) // 28 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
1411 if (VecSize <= 256) // 32 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1413 if (VecSize <= 288) // 36 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1415 if (VecSize <= 320) // 40 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1417 if (VecSize <= 352) // 44 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1419 if (VecSize <= 384) // 48 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1421 if (VecSize <= 512) // 64 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1423 if (VecSize <= 1024) // 128 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1425
1426 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1427 }
1428
1429 if (VecSize <= 32) // 4 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1431 if (VecSize <= 64) // 8 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1433 if (VecSize <= 96) // 12 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1435 if (VecSize <= 128) // 16 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1437 if (VecSize <= 160) // 20 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1439 if (VecSize <= 192) // 24 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
1441 if (VecSize <= 224) // 28 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
1443 if (VecSize <= 256) // 32 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1445 if (VecSize <= 288) // 36 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1447 if (VecSize <= 320) // 40 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1449 if (VecSize <= 352) // 44 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1451 if (VecSize <= 384) // 48 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1453 if (VecSize <= 512) // 64 bytes
1454 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1455 if (VecSize <= 1024) // 128 bytes
1456 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1457
1458 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1459}
1460
1461static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1462 if (VecSize <= 32) // 4 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1464 if (VecSize <= 64) // 8 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1466 if (VecSize <= 96) // 12 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1468 if (VecSize <= 128) // 16 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1470 if (VecSize <= 160) // 20 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1472 if (VecSize <= 192) // 24 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1474 if (VecSize <= 224) // 28 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1476 if (VecSize <= 256) // 32 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1478 if (VecSize <= 288) // 36 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1480 if (VecSize <= 320) // 40 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1482 if (VecSize <= 352) // 44 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1484 if (VecSize <= 384) // 48 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1486 if (VecSize <= 512) // 64 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1488 if (VecSize <= 1024) // 128 bytes
1489 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1490
1491 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1492}
1493
1494static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1495 if (VecSize <= 32) // 4 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1497 if (VecSize <= 64) // 8 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1499 if (VecSize <= 96) // 12 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1501 if (VecSize <= 128) // 16 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1503 if (VecSize <= 160) // 20 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1505 if (VecSize <= 192) // 24 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1507 if (VecSize <= 224) // 28 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1509 if (VecSize <= 256) // 32 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1511 if (VecSize <= 288) // 36 bytes
1512 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1513 if (VecSize <= 320) // 40 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1515 if (VecSize <= 352) // 44 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1517 if (VecSize <= 384) // 48 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1519 if (VecSize <= 512) // 64 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1521 if (VecSize <= 1024) // 128 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1523
1524 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1525}
1526
1527static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1528 if (VecSize <= 64) // 8 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1530 if (VecSize <= 128) // 16 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1532 if (VecSize <= 256) // 32 bytes
1533 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1534 if (VecSize <= 512) // 64 bytes
1535 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1536 if (VecSize <= 1024) // 128 bytes
1537 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1538
1539 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1540}
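// Example of the size mapping: a 256-bit (32-byte) vector indexed with 32-bit
// elements maps to S_INDIRECT_REG_WRITE_MOVREL_B32_V8, while the same vector
// with 64-bit elements maps to S_INDIRECT_REG_WRITE_MOVREL_B64_V4; sizes in
// between are rounded up to the next supported pseudo.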
1541
1542const MCInstrDesc &
1543SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1544 bool IsSGPR) const {
1545 if (IsSGPR) {
1546 switch (EltSize) {
1547 case 32:
1548 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1549 case 64:
1550 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1551 default:
1552 llvm_unreachable("invalid reg indexing elt size");
1553 }
1554 }
1555
1556 assert(EltSize == 32 && "invalid reg indexing elt size");
1557  return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1558}
1559
1560static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1561 switch (Size) {
1562 case 4:
1563 return AMDGPU::SI_SPILL_S32_SAVE;
1564 case 8:
1565 return AMDGPU::SI_SPILL_S64_SAVE;
1566 case 12:
1567 return AMDGPU::SI_SPILL_S96_SAVE;
1568 case 16:
1569 return AMDGPU::SI_SPILL_S128_SAVE;
1570 case 20:
1571 return AMDGPU::SI_SPILL_S160_SAVE;
1572 case 24:
1573 return AMDGPU::SI_SPILL_S192_SAVE;
1574 case 28:
1575 return AMDGPU::SI_SPILL_S224_SAVE;
1576 case 32:
1577 return AMDGPU::SI_SPILL_S256_SAVE;
1578 case 36:
1579 return AMDGPU::SI_SPILL_S288_SAVE;
1580 case 40:
1581 return AMDGPU::SI_SPILL_S320_SAVE;
1582 case 44:
1583 return AMDGPU::SI_SPILL_S352_SAVE;
1584 case 48:
1585 return AMDGPU::SI_SPILL_S384_SAVE;
1586 case 64:
1587 return AMDGPU::SI_SPILL_S512_SAVE;
1588 case 128:
1589 return AMDGPU::SI_SPILL_S1024_SAVE;
1590 default:
1591 llvm_unreachable("unknown register size");
1592 }
1593}
1594
1595static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1596 switch (Size) {
1597 case 2:
1598 return AMDGPU::SI_SPILL_V16_SAVE;
1599 case 4:
1600 return AMDGPU::SI_SPILL_V32_SAVE;
1601 case 8:
1602 return AMDGPU::SI_SPILL_V64_SAVE;
1603 case 12:
1604 return AMDGPU::SI_SPILL_V96_SAVE;
1605 case 16:
1606 return AMDGPU::SI_SPILL_V128_SAVE;
1607 case 20:
1608 return AMDGPU::SI_SPILL_V160_SAVE;
1609 case 24:
1610 return AMDGPU::SI_SPILL_V192_SAVE;
1611 case 28:
1612 return AMDGPU::SI_SPILL_V224_SAVE;
1613 case 32:
1614 return AMDGPU::SI_SPILL_V256_SAVE;
1615 case 36:
1616 return AMDGPU::SI_SPILL_V288_SAVE;
1617 case 40:
1618 return AMDGPU::SI_SPILL_V320_SAVE;
1619 case 44:
1620 return AMDGPU::SI_SPILL_V352_SAVE;
1621 case 48:
1622 return AMDGPU::SI_SPILL_V384_SAVE;
1623 case 64:
1624 return AMDGPU::SI_SPILL_V512_SAVE;
1625 case 128:
1626 return AMDGPU::SI_SPILL_V1024_SAVE;
1627 default:
1628 llvm_unreachable("unknown register size");
1629 }
1630}
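// Note on units: the spill opcodes are selected by size in bytes, so a 32-bit
// VGPR uses SI_SPILL_V32_SAVE (Size == 4) and a 1024-bit tuple uses
// SI_SPILL_V1024_SAVE (Size == 128); 16-bit registers get the dedicated
// SI_SPILL_V16_SAVE pseudo.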
1631
1632static unsigned getAVSpillSaveOpcode(unsigned Size) {
1633 switch (Size) {
1634 case 4:
1635 return AMDGPU::SI_SPILL_AV32_SAVE;
1636 case 8:
1637 return AMDGPU::SI_SPILL_AV64_SAVE;
1638 case 12:
1639 return AMDGPU::SI_SPILL_AV96_SAVE;
1640 case 16:
1641 return AMDGPU::SI_SPILL_AV128_SAVE;
1642 case 20:
1643 return AMDGPU::SI_SPILL_AV160_SAVE;
1644 case 24:
1645 return AMDGPU::SI_SPILL_AV192_SAVE;
1646 case 28:
1647 return AMDGPU::SI_SPILL_AV224_SAVE;
1648 case 32:
1649 return AMDGPU::SI_SPILL_AV256_SAVE;
1650 case 36:
1651 return AMDGPU::SI_SPILL_AV288_SAVE;
1652 case 40:
1653 return AMDGPU::SI_SPILL_AV320_SAVE;
1654 case 44:
1655 return AMDGPU::SI_SPILL_AV352_SAVE;
1656 case 48:
1657 return AMDGPU::SI_SPILL_AV384_SAVE;
1658 case 64:
1659 return AMDGPU::SI_SPILL_AV512_SAVE;
1660 case 128:
1661 return AMDGPU::SI_SPILL_AV1024_SAVE;
1662 default:
1663 llvm_unreachable("unknown register size");
1664 }
1665}
1666
1667static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1668 bool IsVectorSuperClass) {
1669 // Currently, there is only 32-bit WWM register spills needed.
1670 if (Size != 4)
1671 llvm_unreachable("unknown wwm register spill size");
1672
1673 if (IsVectorSuperClass)
1674 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1675
1676 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1677}
1678
1679unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1680    Register Reg, const TargetRegisterClass *RC, unsigned Size,
1681 const SIMachineFunctionInfo &MFI) const {
1682 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1683
1684 // Choose the right opcode if spilling a WWM register.
1685  if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1686    return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1687
1688 // TODO: Check if AGPRs are available
1689 if (ST.hasMAIInsts())
1690 return getAVSpillSaveOpcode(Size);
1691
1692  return getVGPRSpillSaveOpcode(Size);
1693}
1694
1695void SIInstrInfo::storeRegToStackSlot(
1696    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1697    bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1698 MachineInstr::MIFlag Flags) const {
1699 MachineFunction *MF = MBB.getParent();
1700  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1701  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1702 const DebugLoc &DL = MBB.findDebugLoc(MI);
1703
1704 MachinePointerInfo PtrInfo
1705 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1706  MachineMemOperand *MMO = MF->getMachineMemOperand(
1707      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1708 FrameInfo.getObjectAlign(FrameIndex));
1709 unsigned SpillSize = RI.getSpillSize(*RC);
1710
1712 if (RI.isSGPRClass(RC)) {
1713 MFI->setHasSpilledSGPRs();
1714 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1715 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1716 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1717
1718 // We are only allowed to create one new instruction when spilling
1719 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1720 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1721
1722 // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1723 // need to make sure we are using the correct register class.
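 // For example, a virtual 32-bit source is constrained below to
 // SReg_32_XM0_XEXEC, a class that excludes m0 and exec, matching the
 // asserts above.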
1724 if (SrcReg.isVirtual() && SpillSize == 4) {
1725 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1726 }
1727
1728 BuildMI(MBB, MI, DL, OpDesc)
1729 .addReg(SrcReg, getKillRegState(isKill)) // data
1730 .addFrameIndex(FrameIndex) // addr
1731 .addMemOperand(MMO)
1733
1734 if (RI.spillSGPRToVGPR())
1735 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1736 return;
1737 }
1738
1739 unsigned Opcode =
1740 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1741 MFI->setHasSpilledVGPRs();
1742
1743 BuildMI(MBB, MI, DL, get(Opcode))
1744 .addReg(SrcReg, getKillRegState(isKill)) // data
1745 .addFrameIndex(FrameIndex) // addr
1746 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1747 .addImm(0) // offset
1748 .addMemOperand(MMO);
1749}
1750
1751static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1752 switch (Size) {
1753 case 4:
1754 return AMDGPU::SI_SPILL_S32_RESTORE;
1755 case 8:
1756 return AMDGPU::SI_SPILL_S64_RESTORE;
1757 case 12:
1758 return AMDGPU::SI_SPILL_S96_RESTORE;
1759 case 16:
1760 return AMDGPU::SI_SPILL_S128_RESTORE;
1761 case 20:
1762 return AMDGPU::SI_SPILL_S160_RESTORE;
1763 case 24:
1764 return AMDGPU::SI_SPILL_S192_RESTORE;
1765 case 28:
1766 return AMDGPU::SI_SPILL_S224_RESTORE;
1767 case 32:
1768 return AMDGPU::SI_SPILL_S256_RESTORE;
1769 case 36:
1770 return AMDGPU::SI_SPILL_S288_RESTORE;
1771 case 40:
1772 return AMDGPU::SI_SPILL_S320_RESTORE;
1773 case 44:
1774 return AMDGPU::SI_SPILL_S352_RESTORE;
1775 case 48:
1776 return AMDGPU::SI_SPILL_S384_RESTORE;
1777 case 64:
1778 return AMDGPU::SI_SPILL_S512_RESTORE;
1779 case 128:
1780 return AMDGPU::SI_SPILL_S1024_RESTORE;
1781 default:
1782 llvm_unreachable("unknown register size");
1783 }
1784}
1785
1786static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1787 switch (Size) {
1788 case 2:
1789 return AMDGPU::SI_SPILL_V16_RESTORE;
1790 case 4:
1791 return AMDGPU::SI_SPILL_V32_RESTORE;
1792 case 8:
1793 return AMDGPU::SI_SPILL_V64_RESTORE;
1794 case 12:
1795 return AMDGPU::SI_SPILL_V96_RESTORE;
1796 case 16:
1797 return AMDGPU::SI_SPILL_V128_RESTORE;
1798 case 20:
1799 return AMDGPU::SI_SPILL_V160_RESTORE;
1800 case 24:
1801 return AMDGPU::SI_SPILL_V192_RESTORE;
1802 case 28:
1803 return AMDGPU::SI_SPILL_V224_RESTORE;
1804 case 32:
1805 return AMDGPU::SI_SPILL_V256_RESTORE;
1806 case 36:
1807 return AMDGPU::SI_SPILL_V288_RESTORE;
1808 case 40:
1809 return AMDGPU::SI_SPILL_V320_RESTORE;
1810 case 44:
1811 return AMDGPU::SI_SPILL_V352_RESTORE;
1812 case 48:
1813 return AMDGPU::SI_SPILL_V384_RESTORE;
1814 case 64:
1815 return AMDGPU::SI_SPILL_V512_RESTORE;
1816 case 128:
1817 return AMDGPU::SI_SPILL_V1024_RESTORE;
1818 default:
1819 llvm_unreachable("unknown register size");
1820 }
1821}
1822
1823static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1824 switch (Size) {
1825 case 4:
1826 return AMDGPU::SI_SPILL_AV32_RESTORE;
1827 case 8:
1828 return AMDGPU::SI_SPILL_AV64_RESTORE;
1829 case 12:
1830 return AMDGPU::SI_SPILL_AV96_RESTORE;
1831 case 16:
1832 return AMDGPU::SI_SPILL_AV128_RESTORE;
1833 case 20:
1834 return AMDGPU::SI_SPILL_AV160_RESTORE;
1835 case 24:
1836 return AMDGPU::SI_SPILL_AV192_RESTORE;
1837 case 28:
1838 return AMDGPU::SI_SPILL_AV224_RESTORE;
1839 case 32:
1840 return AMDGPU::SI_SPILL_AV256_RESTORE;
1841 case 36:
1842 return AMDGPU::SI_SPILL_AV288_RESTORE;
1843 case 40:
1844 return AMDGPU::SI_SPILL_AV320_RESTORE;
1845 case 44:
1846 return AMDGPU::SI_SPILL_AV352_RESTORE;
1847 case 48:
1848 return AMDGPU::SI_SPILL_AV384_RESTORE;
1849 case 64:
1850 return AMDGPU::SI_SPILL_AV512_RESTORE;
1851 case 128:
1852 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1853 default:
1854 llvm_unreachable("unknown register size");
1855 }
1856}
1857
1858static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1859 bool IsVectorSuperClass) {
1860 // Currently, only 32-bit WWM register spills are needed.
1861 if (Size != 4)
1862 llvm_unreachable("unknown wwm register spill size");
1863
1864 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1865 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1866
1867 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1868}
1869
1871 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1872 const SIMachineFunctionInfo &MFI) const {
1873 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1874
1875 // Choose the right opcode if restoring a WWM register.
1876 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1877 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1878
1879 // TODO: Check if AGPRs are available
1880 if (ST.hasMAIInsts())
1881 return getAVSpillRestoreOpcode(Size);
1882
1883 assert(!RI.isAGPRClass(RC));
1884 return getVGPRSpillRestoreOpcode(Size);
1885}
1886
1889 Register DestReg, int FrameIndex,
1890 const TargetRegisterClass *RC,
1891 Register VReg, unsigned SubReg,
1892 MachineInstr::MIFlag Flags) const {
1893 MachineFunction *MF = MBB.getParent();
1895 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1896 const DebugLoc &DL = MBB.findDebugLoc(MI);
1897 unsigned SpillSize = RI.getSpillSize(*RC);
1898
1899 MachinePointerInfo PtrInfo
1900 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1901
1903 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1904 FrameInfo.getObjectAlign(FrameIndex));
1905
1906 if (RI.isSGPRClass(RC)) {
1907 MFI->setHasSpilledSGPRs();
1908 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1909 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1910 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1911
1912 // FIXME: Maybe this should not include a memoperand because it will be
1913 // lowered to non-memory instructions.
1914 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1915 if (DestReg.isVirtual() && SpillSize == 4) {
1917 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1918 }
1919
1920 if (RI.spillSGPRToVGPR())
1921 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1922 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1923 .addFrameIndex(FrameIndex) // addr
1924 .addMemOperand(MMO)
1926
1927 return;
1928 }
1929
1930 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1931 SpillSize, *MFI);
1932 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1933 .addFrameIndex(FrameIndex) // vaddr
1934 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1935 .addImm(0) // offset
1936 .addMemOperand(MMO);
1937}
1938
1943
1946 unsigned Quantity) const {
1947 DebugLoc DL = MBB.findDebugLoc(MI);
1948 unsigned MaxSNopCount = 1u << ST.getSNopBits();
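 // Illustrative example, assuming a 3-bit s_nop field (MaxSNopCount == 8):
 // a request for 10 wait states emits "s_nop 7" followed by "s_nop 1",
 // since s_nop N provides N + 1 wait states.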
1949 while (Quantity > 0) {
1950 unsigned Arg = std::min(Quantity, MaxSNopCount);
1951 Quantity -= Arg;
1952 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1953 }
1954}
1955
1957 auto *MF = MBB.getParent();
1958 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1959
1960 assert(Info->isEntryFunction());
1961
1962 if (MBB.succ_empty()) {
1963 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1964 if (HasNoTerminator) {
1965 if (Info->returnsVoid()) {
1966 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1967 } else {
1968 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1969 }
1970 }
1971 }
1972}
1973
1977 const DebugLoc &DL) const {
1978 MachineFunction *MF = MBB.getParent();
1979 constexpr unsigned DoorbellIDMask = 0x3ff;
1980 constexpr unsigned ECQueueWaveAbort = 0x400;
1981
1982 MachineBasicBlock *TrapBB = &MBB;
1983 MachineBasicBlock *ContBB = &MBB;
1984 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1985
1986 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1987 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1988 TrapBB = MF->CreateMachineBasicBlock();
1989 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1990 MF->push_back(TrapBB);
1991 MBB.addSuccessor(TrapBB);
1992 } else {
1993 // Since we're adding HaltLoopBB and modifying the CFG, we must return a
1994 // different block to signal the change.
1995 ContBB = HaltLoopBB;
1996 }
1997
1998 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
1999 // this will be a nop.
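 // In outline, the trap block built below saves m0 to ttmp2, masks the
 // doorbell ID returned by s_sendmsg_rtn, sets the queue-wave-abort bit,
 // sends the message through m0, restores m0, and branches to the halt loop.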
2000 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2001 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2002 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2003 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2004 DoorbellReg)
2006 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2007 .addUse(AMDGPU::M0);
2008 Register DoorbellRegMasked =
2009 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2010 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2011 .addUse(DoorbellReg)
2012 .addImm(DoorbellIDMask);
2013 Register SetWaveAbortBit =
2014 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2015 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2016 .addUse(DoorbellRegMasked)
2017 .addImm(ECQueueWaveAbort);
2018 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2019 .addUse(SetWaveAbortBit);
2020 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2022 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2023 .addUse(AMDGPU::TTMP2);
2024 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2025 TrapBB->addSuccessor(HaltLoopBB);
2026
2027 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2028 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2029 .addMBB(HaltLoopBB);
2030 MF->push_back(HaltLoopBB);
2031 HaltLoopBB->addSuccessor(HaltLoopBB);
2032
2033 return ContBB;
2034}
2035
2037 switch (MI.getOpcode()) {
2038 default:
2039 if (MI.isMetaInstruction())
2040 return 0;
2041 return 1; // FIXME: Do wait states equal cycles?
2042
2043 case AMDGPU::S_NOP:
2044 return MI.getOperand(0).getImm() + 1;
2045 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2046 // hazard, even if one exists, won't really be visible. Should we handle it?
2047 }
2048}
2049
2051 MachineBasicBlock &MBB = *MI.getParent();
2052 DebugLoc DL = MBB.findDebugLoc(MI);
2054 switch (MI.getOpcode()) {
2055 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2056 case AMDGPU::S_MOV_B64_term:
2057 // This is only a terminator to get the correct spill code placement during
2058 // register allocation.
2059 MI.setDesc(get(AMDGPU::S_MOV_B64));
2060 break;
2061
2062 case AMDGPU::S_MOV_B32_term:
2063 // This is only a terminator to get the correct spill code placement during
2064 // register allocation.
2065 MI.setDesc(get(AMDGPU::S_MOV_B32));
2066 break;
2067
2068 case AMDGPU::S_XOR_B64_term:
2069 // This is only a terminator to get the correct spill code placement during
2070 // register allocation.
2071 MI.setDesc(get(AMDGPU::S_XOR_B64));
2072 break;
2073
2074 case AMDGPU::S_XOR_B32_term:
2075 // This is only a terminator to get the correct spill code placement during
2076 // register allocation.
2077 MI.setDesc(get(AMDGPU::S_XOR_B32));
2078 break;
2079 case AMDGPU::S_OR_B64_term:
2080 // This is only a terminator to get the correct spill code placement during
2081 // register allocation.
2082 MI.setDesc(get(AMDGPU::S_OR_B64));
2083 break;
2084 case AMDGPU::S_OR_B32_term:
2085 // This is only a terminator to get the correct spill code placement during
2086 // register allocation.
2087 MI.setDesc(get(AMDGPU::S_OR_B32));
2088 break;
2089
2090 case AMDGPU::S_ANDN2_B64_term:
2091 // This is only a terminator to get the correct spill code placement during
2092 // register allocation.
2093 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2094 break;
2095
2096 case AMDGPU::S_ANDN2_B32_term:
2097 // This is only a terminator to get the correct spill code placement during
2098 // register allocation.
2099 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2100 break;
2101
2102 case AMDGPU::S_AND_B64_term:
2103 // This is only a terminator to get the correct spill code placement during
2104 // register allocation.
2105 MI.setDesc(get(AMDGPU::S_AND_B64));
2106 break;
2107
2108 case AMDGPU::S_AND_B32_term:
2109 // This is only a terminator to get the correct spill code placement during
2110 // register allocation.
2111 MI.setDesc(get(AMDGPU::S_AND_B32));
2112 break;
2113
2114 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2115 // This is only a terminator to get the correct spill code placement during
2116 // register allocation.
2117 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2118 break;
2119
2120 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2121 // This is only a terminator to get the correct spill code placement during
2122 // register allocation.
2123 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2124 break;
2125
2126 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2127 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2128 break;
2129
2130 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2131 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2132 break;
2133 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2134 Register Dst = MI.getOperand(0).getReg();
2135 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2136 MI.setDesc(
2137 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2138 break;
2139 }
2140 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2141 Register Dst = MI.getOperand(0).getReg();
2142 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2143 int64_t Imm = MI.getOperand(1).getImm();
2144
2145 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2146 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2147 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2150 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2151 .addImm(SignExtend64<32>(Imm >> 32))
2153 MI.eraseFromParent();
2154 break;
2155 }
2156
2157 [[fallthrough]];
2158 }
2159 case AMDGPU::V_MOV_B64_PSEUDO: {
2160 Register Dst = MI.getOperand(0).getReg();
2161 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2162 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2163
2164 const MachineOperand &SrcOp = MI.getOperand(1);
2165 // FIXME: Will this work for 64-bit floating point immediates?
2166 assert(!SrcOp.isFPImm());
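    // When a plain 64-bit mov cannot be used, the pseudo is split below into
    // either a single v_pk_mov_b32 (where available, for equal inline-constant
    // halves or a non-AGPR register source) or two v_mov_b32 writes to
    // sub0/sub1.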
2167 if (ST.hasMovB64()) {
2168 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2169 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2170 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2171 break;
2172 }
2173 if (SrcOp.isImm()) {
2174 APInt Imm(64, SrcOp.getImm());
2175 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2176 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2177 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2178 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2180 .addImm(Lo.getSExtValue())
2182 .addImm(Lo.getSExtValue())
2183 .addImm(0) // op_sel_lo
2184 .addImm(0) // op_sel_hi
2185 .addImm(0) // neg_lo
2186 .addImm(0) // neg_hi
2187 .addImm(0); // clamp
2188 } else {
2189 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2190 .addImm(Lo.getSExtValue())
2192 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2193 .addImm(Hi.getSExtValue())
2195 }
2196 } else {
2197 assert(SrcOp.isReg());
2198 if (ST.hasPkMovB32() &&
2199 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2200 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2201 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2202 .addReg(SrcOp.getReg())
2204 .addReg(SrcOp.getReg())
2205 .addImm(0) // op_sel_lo
2206 .addImm(0) // op_sel_hi
2207 .addImm(0) // neg_lo
2208 .addImm(0) // neg_hi
2209 .addImm(0); // clamp
2210 } else {
2211 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2212 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2214 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2215 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2217 }
2218 }
2219 MI.eraseFromParent();
2220 break;
2221 }
2222 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2224 break;
2225 }
2226 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2227 const MachineOperand &SrcOp = MI.getOperand(1);
2228 assert(!SrcOp.isFPImm());
2229
2230 if (ST.has64BitLiterals()) {
2231 MI.setDesc(get(AMDGPU::S_MOV_B64));
2232 break;
2233 }
2234
2235 APInt Imm(64, SrcOp.getImm());
2236 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2237 MI.setDesc(get(AMDGPU::S_MOV_B64));
2238 break;
2239 }
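    // Otherwise materialize the two halves separately; e.g. (illustrative)
    // 0x1234567800000005 becomes:
    //   s_mov_b32 dst.sub0, 5
    //   s_mov_b32 dst.sub1, 0x12345678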
2240
2241 Register Dst = MI.getOperand(0).getReg();
2242 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2243 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2244
2245 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2246 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2247 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2248 .addImm(Lo.getSExtValue())
2250 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2251 .addImm(Hi.getSExtValue())
2253 MI.eraseFromParent();
2254 break;
2255 }
2256 case AMDGPU::V_SET_INACTIVE_B32: {
2257 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
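    // V_CNDMASK_B32 selects src1 for lanes whose mask bit is set and src0
    // otherwise, hence the operand reordering below.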
2258 Register DstReg = MI.getOperand(0).getReg();
2259 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2260 .add(MI.getOperand(3))
2261 .add(MI.getOperand(4))
2262 .add(MI.getOperand(1))
2263 .add(MI.getOperand(2))
2264 .add(MI.getOperand(5));
2265 MI.eraseFromParent();
2266 break;
2267 }
2268 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2269 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2270 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2271 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2272 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2273 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2274 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2275 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2276 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2277 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2278 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2279 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2280 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2281 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2282 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2283 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2284 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2285 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2286 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2287 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2288 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2289 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2290 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2291 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2292 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2293 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2294 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2295 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2296 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2297 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2298 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2299 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2300 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2301 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2302
2303 unsigned Opc;
2304 if (RI.hasVGPRs(EltRC)) {
2305 Opc = AMDGPU::V_MOVRELD_B32_e32;
2306 } else {
2307 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2308 : AMDGPU::S_MOVRELD_B32;
2309 }
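    // The movrel uses M0 (set up elsewhere) as the dynamic index; the implicit
    // def and use of VecReg added below are tied to model the read-modify-write
    // of the full vector register.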
2310
2311 const MCInstrDesc &OpDesc = get(Opc);
2312 Register VecReg = MI.getOperand(0).getReg();
2313 bool IsUndef = MI.getOperand(1).isUndef();
2314 unsigned SubReg = MI.getOperand(3).getImm();
2315 assert(VecReg == MI.getOperand(1).getReg());
2316
2318 BuildMI(MBB, MI, DL, OpDesc)
2319 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2320 .add(MI.getOperand(2))
2322 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2323
2324 const int ImpDefIdx =
2325 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2326 const int ImpUseIdx = ImpDefIdx + 1;
2327 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2328 MI.eraseFromParent();
2329 break;
2330 }
2331 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2332 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2333 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2334 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2335 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2336 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2337 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2338 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2339 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2340 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2341 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2342 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2343 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2344 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2345 assert(ST.useVGPRIndexMode());
2346 Register VecReg = MI.getOperand(0).getReg();
2347 bool IsUndef = MI.getOperand(1).isUndef();
2348 MachineOperand &Idx = MI.getOperand(3);
2349 Register SubReg = MI.getOperand(4).getImm();
2350
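    // Roughly, the bundle built below is (sketch; operands simplified):
    //   s_set_gpr_idx_on  <idx>, gpr_idx(DST)
    //   v_mov_b32         <VecReg[SubReg]>, <src>
    //   s_set_gpr_idx_off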
2351 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2352 .add(Idx)
2354 SetOn->getOperand(3).setIsUndef();
2355
2356 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2358 BuildMI(MBB, MI, DL, OpDesc)
2359 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2360 .add(MI.getOperand(2))
2362 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2363
2364 const int ImpDefIdx =
2365 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2366 const int ImpUseIdx = ImpDefIdx + 1;
2367 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2368
2369 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2370
2371 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2372
2373 MI.eraseFromParent();
2374 break;
2375 }
2376 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2377 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2378 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2379 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2380 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2381 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2382 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2383 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2384 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2385 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2386 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2387 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2388 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2389 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2390 assert(ST.useVGPRIndexMode());
2391 Register Dst = MI.getOperand(0).getReg();
2392 Register VecReg = MI.getOperand(1).getReg();
2393 bool IsUndef = MI.getOperand(1).isUndef();
2394 Register Idx = MI.getOperand(2).getReg();
2395 Register SubReg = MI.getOperand(3).getImm();
2396
2397 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2398 .addReg(Idx)
2400 SetOn->getOperand(3).setIsUndef();
2401
2402 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2403 .addDef(Dst)
2404 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2405 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2406
2407 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2408
2409 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2410
2411 MI.eraseFromParent();
2412 break;
2413 }
2414 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2415 MachineFunction &MF = *MBB.getParent();
2416 Register Reg = MI.getOperand(0).getReg();
2417 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2418 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2419 MachineOperand OpLo = MI.getOperand(1);
2420 MachineOperand OpHi = MI.getOperand(2);
2421
2422 // Create a bundle so these instructions won't be re-ordered by the
2423 // post-RA scheduler.
2424 MIBundleBuilder Bundler(MBB, MI);
2425 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2426
2427 // What we want here is an offset from the value returned by s_getpc (which
2428 // is the address of the s_add_u32 instruction) to the global variable, but
2429 // since the encoding of $symbol starts 4 bytes after the start of the
2430 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2431 // small. This requires us to add 4 to the global variable offset in order
2432 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2433 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2434 // instruction.
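    // For example (a sketch; registers depend on allocation):
    //   s_getpc_b64  s[0:1]
    //   s_add_u32    s0, s0, sym@rel32@lo+4
    //   s_addc_u32   s1, s1, sym@rel32@hi+12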
2435
2436 int64_t Adjust = 0;
2437 if (ST.hasGetPCZeroExtension()) {
2438 // Fix up hardware that does not sign-extend the 48-bit PC value by
2439 // inserting: s_sext_i32_i16 reghi, reghi
2440 Bundler.append(
2441 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2442 Adjust += 4;
2443 }
2444
2445 if (OpLo.isGlobal())
2446 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2447 Bundler.append(
2448 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2449
2450 if (OpHi.isGlobal())
2451 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2452 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2453 .addReg(RegHi)
2454 .add(OpHi));
2455
2456 finalizeBundle(MBB, Bundler.begin());
2457
2458 MI.eraseFromParent();
2459 break;
2460 }
2461 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2462 MachineFunction &MF = *MBB.getParent();
2463 Register Reg = MI.getOperand(0).getReg();
2464 MachineOperand Op = MI.getOperand(1);
2465
2466 // Create a bundle so these instructions won't be re-ordered by the
2467 // post-RA scheduler.
2468 MIBundleBuilder Bundler(MBB, MI);
2469 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2470 if (Op.isGlobal())
2471 Op.setOffset(Op.getOffset() + 4);
2472 Bundler.append(
2473 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2474
2475 finalizeBundle(MBB, Bundler.begin());
2476
2477 MI.eraseFromParent();
2478 break;
2479 }
2480 case AMDGPU::ENTER_STRICT_WWM: {
2481 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2482 // Whole Wave Mode is entered.
2483 MI.setDesc(get(LMC.OrSaveExecOpc));
2484 break;
2485 }
2486 case AMDGPU::ENTER_STRICT_WQM: {
2487 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2488 // STRICT_WQM is entered.
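    // The expansion below is effectively (wave-size dependent, with LMC
    // supplying the matching opcodes):
    //   sdst = exec
    //   exec = s_wqm_b32/b64 exec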
2489 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2490 .addReg(LMC.ExecReg);
2491 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2492
2493 MI.eraseFromParent();
2494 break;
2495 }
2496 case AMDGPU::EXIT_STRICT_WWM:
2497 case AMDGPU::EXIT_STRICT_WQM: {
2498 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2499 // WWM/STRICT_WQM is exited.
2500 MI.setDesc(get(LMC.MovOpc));
2501 break;
2502 }
2503 case AMDGPU::SI_RETURN: {
2504 const MachineFunction *MF = MBB.getParent();
2505 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2506 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2507 // Hiding the return address use with SI_RETURN may lead to extra kills in
2508 // the function and missing live-ins. We are fine in practice because callee
2509 // saved register handling ensures the register value is restored before
2510 // RET, but we need the undef flag here to appease the MachineVerifier
2511 // liveness checks.
2513 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2514 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2515
2516 MIB.copyImplicitOps(MI);
2517 MI.eraseFromParent();
2518 break;
2519 }
2520
2521 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2522 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2523 MI.setDesc(get(AMDGPU::S_MUL_U64));
2524 break;
2525
2526 case AMDGPU::S_GETPC_B64_pseudo:
2527 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2528 if (ST.hasGetPCZeroExtension()) {
2529 Register Dst = MI.getOperand(0).getReg();
2530 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2531 // Fix up hardware that does not sign-extend the 48-bit PC value by
2532 // inserting: s_sext_i32_i16 dsthi, dsthi
2533 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2534 DstHi)
2535 .addReg(DstHi);
2536 }
2537 break;
2538
2539 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2540 assert(ST.hasBF16PackedInsts());
2541 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2542 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2543 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2544 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2545 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2546 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2547 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2548 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2549 break;
2550 }
2551
2552 return true;
2553}
2554
2557 unsigned SubIdx,
2558 const MachineInstr &Orig) const {
2559
2560 // Try shrinking the instruction to rematerialize only the part needed in the
2561 // current context.
2562 // TODO: Handle more cases.
2563 unsigned Opcode = Orig.getOpcode();
2564 switch (Opcode) {
2565 case AMDGPU::S_LOAD_DWORDX16_IMM:
2566 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2567 if (SubIdx != 0)
2568 break;
2569
2570 if (I == MBB.end())
2571 break;
2572
2573 if (I->isBundled())
2574 break;
2575
2576 // Look for a single use of the register that is also a subreg.
2577 Register RegToFind = Orig.getOperand(0).getReg();
2578 MachineOperand *UseMO = nullptr;
2579 for (auto &CandMO : I->operands()) {
2580 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2581 continue;
2582 if (UseMO) {
2583 UseMO = nullptr;
2584 break;
2585 }
2586 UseMO = &CandMO;
2587 }
2588 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2589 break;
2590
2591 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2592 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2593
2594 MachineFunction *MF = MBB.getParent();
2596 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2597
2598 unsigned NewOpcode = -1;
2599 if (SubregSize == 256)
2600 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2601 else if (SubregSize == 128)
2602 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2603 else
2604 break;
2605
2606 const MCInstrDesc &TID = get(NewOpcode);
2607 const TargetRegisterClass *NewRC =
2608 RI.getAllocatableClass(getRegClass(TID, 0));
2609 MRI.setRegClass(DestReg, NewRC);
2610
2611 UseMO->setReg(DestReg);
2612 UseMO->setSubReg(AMDGPU::NoSubRegister);
2613
2614 // Use a smaller load with the desired size, possibly with updated offset.
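    // e.g. an S_LOAD_DWORDX16 whose only use reads sub4_sub5_sub6_sub7 is
    // re-emitted as an S_LOAD_DWORDX4 with the immediate offset increased by
    // 16 bytes (illustrative; the subreg determines both offset and width).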
2615 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2616 MI->setDesc(TID);
2617 MI->getOperand(0).setReg(DestReg);
2618 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2619 if (Offset) {
2620 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2621 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2622 OffsetMO->setImm(FinalOffset);
2623 }
2625 for (const MachineMemOperand *MemOp : Orig.memoperands())
2626 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2627 SubregSize / 8));
2628 MI->setMemRefs(*MF, NewMMOs);
2629
2630 MBB.insert(I, MI);
2631 return;
2632 }
2633
2634 default:
2635 break;
2636 }
2637
2638 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig);
2639}
2640
2641std::pair<MachineInstr*, MachineInstr*>
2643 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2644
2645 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2647 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2648 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2649 return std::pair(&MI, nullptr);
2650 }
2651
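  // Otherwise split into two v_mov_b32_dpp on sub0/sub1 and, for a virtual
  // destination, recombine the halves with a REG_SEQUENCE below.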
2652 MachineBasicBlock &MBB = *MI.getParent();
2653 DebugLoc DL = MBB.findDebugLoc(MI);
2654 MachineFunction *MF = MBB.getParent();
2656 Register Dst = MI.getOperand(0).getReg();
2657 unsigned Part = 0;
2658 MachineInstr *Split[2];
2659
2660 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2661 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2662 if (Dst.isPhysical()) {
2663 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2664 } else {
2665 assert(MRI.isSSA());
2666 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2667 MovDPP.addDef(Tmp);
2668 }
2669
2670 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2671 const MachineOperand &SrcOp = MI.getOperand(I);
2672 assert(!SrcOp.isFPImm());
2673 if (SrcOp.isImm()) {
2674 APInt Imm(64, SrcOp.getImm());
2675 Imm.ashrInPlace(Part * 32);
2676 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2677 } else {
2678 assert(SrcOp.isReg());
2679 Register Src = SrcOp.getReg();
2680 if (Src.isPhysical())
2681 MovDPP.addReg(RI.getSubReg(Src, Sub));
2682 else
2683 MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub);
2684 }
2685 }
2686
2687 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2688 MovDPP.addImm(MO.getImm());
2689
2690 Split[Part] = MovDPP;
2691 ++Part;
2692 }
2693
2694 if (Dst.isVirtual())
2695 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2696 .addReg(Split[0]->getOperand(0).getReg())
2697 .addImm(AMDGPU::sub0)
2698 .addReg(Split[1]->getOperand(0).getReg())
2699 .addImm(AMDGPU::sub1);
2700
2701 MI.eraseFromParent();
2702 return std::pair(Split[0], Split[1]);
2703}
2704
2705std::optional<DestSourcePair>
2707 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2708 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2709
2710 return std::nullopt;
2711}
2712
2714 AMDGPU::OpName Src0OpName,
2715 MachineOperand &Src1,
2716 AMDGPU::OpName Src1OpName) const {
2717 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2718 if (!Src0Mods)
2719 return false;
2720
2721 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2722 assert(Src1Mods &&
2723 "All commutable instructions have both src0 and src1 modifiers");
2724
2725 int Src0ModsVal = Src0Mods->getImm();
2726 int Src1ModsVal = Src1Mods->getImm();
2727
2728 Src1Mods->setImm(Src0ModsVal);
2729 Src0Mods->setImm(Src1ModsVal);
2730 return true;
2731}
2732
2734 MachineOperand &RegOp,
2735 MachineOperand &NonRegOp) {
2736 Register Reg = RegOp.getReg();
2737 unsigned SubReg = RegOp.getSubReg();
2738 bool IsKill = RegOp.isKill();
2739 bool IsDead = RegOp.isDead();
2740 bool IsUndef = RegOp.isUndef();
2741 bool IsDebug = RegOp.isDebug();
2742
2743 if (NonRegOp.isImm())
2744 RegOp.ChangeToImmediate(NonRegOp.getImm());
2745 else if (NonRegOp.isFI())
2746 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2747 else if (NonRegOp.isGlobal()) {
2748 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2749 NonRegOp.getTargetFlags());
2750 } else
2751 return nullptr;
2752
2753 // Make sure we don't reinterpret a subreg index in the target flags.
2754 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2755
2756 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2757 NonRegOp.setSubReg(SubReg);
2758
2759 return &MI;
2760}
2761
2763 MachineOperand &NonRegOp1,
2764 MachineOperand &NonRegOp2) {
2765 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2766 int64_t NonRegVal = NonRegOp1.getImm();
2767
2768 NonRegOp1.setImm(NonRegOp2.getImm());
2769 NonRegOp2.setImm(NonRegVal);
2770 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2771 NonRegOp2.setTargetFlags(TargetFlags);
2772 return &MI;
2773}
2774
2775bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2776 unsigned OpIdx1) const {
2777 const MCInstrDesc &InstDesc = MI.getDesc();
2778 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2779 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2780
2781 unsigned Opc = MI.getOpcode();
2782 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2783
2784 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2785 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2786
2787 // Make sure the swap does not breach the constant bus or literal limits.
2788 // The swap may move a literal to a position other than src0, which is not
2789 // allowed pre-gfx10. However, most test cases need literals in Src0 for VOP.
2790 // FIXME: After gfx9, a literal can be placed somewhere other than Src0.
2791 if (isVALU(MI)) {
2792 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2793 !isInlineConstant(MO0, OpInfo1))
2794 return false;
2795 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2796 !isInlineConstant(MO1, OpInfo0))
2797 return false;
2798 }
2799
2800 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2801 if (OpInfo1.RegClass == -1)
2802 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2803 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2804 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2805 }
2806 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2807 if (OpInfo0.RegClass == -1)
2808 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2809 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2810 isLegalRegOperand(MI, OpIdx0, MO1);
2811 }
2812
2813 // No need to check 64-bit literals, since swapping does not bring new
2814 // 64-bit literals into the current instruction to fold to 32 bits.
2815
2816 return isImmOperandLegal(MI, OpIdx1, MO0);
2817}
2818
2820 unsigned Src0Idx,
2821 unsigned Src1Idx) const {
2822 assert(!NewMI && "this should never be used");
2823
2824 unsigned Opc = MI.getOpcode();
2825 int CommutedOpcode = commuteOpcode(Opc);
2826 if (CommutedOpcode == -1)
2827 return nullptr;
2828
2829 if (Src0Idx > Src1Idx)
2830 std::swap(Src0Idx, Src1Idx);
2831
2832 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2833 static_cast<int>(Src0Idx) &&
2834 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2835 static_cast<int>(Src1Idx) &&
2836 "inconsistency with findCommutedOpIndices");
2837
2838 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2839 return nullptr;
2840
2841 MachineInstr *CommutedMI = nullptr;
2842 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2843 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2844 if (Src0.isReg() && Src1.isReg()) {
2845 // Be sure to copy the source modifiers to the right place.
2846 CommutedMI =
2847 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2848 } else if (Src0.isReg() && !Src1.isReg()) {
2849 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2850 } else if (!Src0.isReg() && Src1.isReg()) {
2851 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2852 } else if (Src0.isImm() && Src1.isImm()) {
2853 CommutedMI = swapImmOperands(MI, Src0, Src1);
2854 } else {
2855 // FIXME: Found two non-register operands to commute. This does happen.
2856 return nullptr;
2857 }
2858
2859 if (CommutedMI) {
2860 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2861 Src1, AMDGPU::OpName::src1_modifiers);
2862
2863 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2864 AMDGPU::OpName::src1_sel);
2865
2866 CommutedMI->setDesc(get(CommutedOpcode));
2867 }
2868
2869 return CommutedMI;
2870}
2871
2872// This needs to be implemented because the source modifiers may be inserted
2873// between the true commutable operands, and the base
2874// TargetInstrInfo::commuteInstruction uses it.
2876 unsigned &SrcOpIdx0,
2877 unsigned &SrcOpIdx1) const {
2878 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2879}
2880
2882 unsigned &SrcOpIdx0,
2883 unsigned &SrcOpIdx1) const {
2884 if (!Desc.isCommutable())
2885 return false;
2886
2887 unsigned Opc = Desc.getOpcode();
2888 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2889 if (Src0Idx == -1)
2890 return false;
2891
2892 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2893 if (Src1Idx == -1)
2894 return false;
2895
2896 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2897}
2898
2900 int64_t BrOffset) const {
2901 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2902 // because its dest block is unanalyzable.
2903 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2904
2905 // Convert to dwords.
2906 BrOffset /= 4;
2907
2908 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2909 // from the next instruction.
2910 BrOffset -= 1;
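  // e.g. a byte offset of 64 from the branch becomes 64 / 4 - 1 = 15 dwords
  // relative to the next instruction.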
2911
2912 return isIntN(BranchOffsetBits, BrOffset);
2913}
2914
2917 return MI.getOperand(0).getMBB();
2918}
2919
2921 for (const MachineInstr &MI : MBB->terminators()) {
2922 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2923 MI.getOpcode() == AMDGPU::SI_LOOP)
2924 return true;
2925 }
2926 return false;
2927}
2928
2930 MachineBasicBlock &DestBB,
2931 MachineBasicBlock &RestoreBB,
2932 const DebugLoc &DL, int64_t BrOffset,
2933 RegScavenger *RS) const {
2934 assert(MBB.empty() &&
2935 "new block should be inserted for expanding unconditional branch");
2936 assert(MBB.pred_size() == 1);
2937 assert(RestoreBB.empty() &&
2938 "restore block should be inserted for restoring clobbered registers");
2939
2940 MachineFunction *MF = MBB.getParent();
2943 auto I = MBB.end();
2944 auto &MCCtx = MF->getContext();
2945
2946 if (ST.useAddPC64Inst()) {
2947 MCSymbol *Offset =
2948 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2949 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2951 MCSymbol *PostAddPCLabel =
2952 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2953 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2954 auto *OffsetExpr = MCBinaryExpr::createSub(
2955 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2956 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2957 Offset->setVariableValue(OffsetExpr);
2958 return;
2959 }
2960
2961 assert(RS && "RegScavenger required for long branching");
2962
2963 // FIXME: Virtual register workaround for RegScavenger not working with empty
2964 // blocks.
2965 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2966
2967 // Note: as this runs after the hazard recognizer, we need to apply some
2968 // hazard workarounds directly.
2969 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2970 ST.hasVALUReadSGPRHazard();
2971 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2972 if (FlushSGPRWrites)
2973 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2975 };
2976
2977 // We need to compute the offset relative to the instruction immediately
2978 // after s_getpc_b64. Insert the PC arithmetic code before the last terminator.
2979 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2980 ApplyHazardWorkarounds();
2981
2982 MCSymbol *PostGetPCLabel =
2983 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2984 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2985
2986 MCSymbol *OffsetLo =
2987 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2988 MCSymbol *OffsetHi =
2989 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2990 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2991 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2992 .addReg(PCReg, 0, AMDGPU::sub0)
2993 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2994 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2995 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2996 .addReg(PCReg, 0, AMDGPU::sub1)
2997 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2998 ApplyHazardWorkarounds();
2999
3000 // Insert the indirect branch after the other terminator.
3001 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
3002 .addReg(PCReg);
3003
3004 // If a spill is needed for the pc register pair, we need to insert a spill
3005 // restore block right before the destination block, and insert a short branch
3006 // into the old destination block's fallthrough predecessor.
3007 // e.g.:
3008 //
3009 // s_cbranch_scc0 skip_long_branch:
3010 //
3011 // long_branch_bb:
3012 // spill s[8:9]
3013 // s_getpc_b64 s[8:9]
3014 // s_add_u32 s8, s8, restore_bb
3015 // s_addc_u32 s9, s9, 0
3016 // s_setpc_b64 s[8:9]
3017 //
3018 // skip_long_branch:
3019 // foo;
3020 //
3021 // .....
3022 //
3023 // dest_bb_fallthrough_predecessor:
3024 // bar;
3025 // s_branch dest_bb
3026 //
3027 // restore_bb:
3028 // restore s[8:9]
3029 // fallthrough dest_bb
3030 //
3031 // dest_bb:
3032 // buzz;
3033
3034 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3035 Register Scav;
3036
3037 // If we've previously reserved a register for long branches, avoid running
3038 // the scavenger and just use that register.
3039 if (LongBranchReservedReg) {
3040 RS->enterBasicBlock(MBB);
3041 Scav = LongBranchReservedReg;
3042 } else {
3043 RS->enterBasicBlockEnd(MBB);
3044 Scav = RS->scavengeRegisterBackwards(
3045 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3046 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3047 }
3048 if (Scav) {
3049 RS->setRegUsed(Scav);
3050 MRI.replaceRegWith(PCReg, Scav);
3051 MRI.clearVirtRegs();
3052 } else {
3053 // Since an SGPR needs a VGPR in order to be spilled, we reuse the temporary
3054 // VGPR's slot for the SGPR spill.
3055 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3056 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3057 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3058 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3059 MRI.clearVirtRegs();
3060 }
3061
3062 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3063 // Now the distance can be defined.
3065 MCSymbolRefExpr::create(DestLabel, MCCtx),
3066 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3067 // Add offset assignments.
3068 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3069 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3070 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3071 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
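  // That is, offset_lo = (dest - post_getpc) & 0xffffffff and
  // offset_hi = (dest - post_getpc) >> 32 (arithmetic shift), matching the
  // 32-bit halves consumed by s_add_u32/s_addc_u32 above.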
3072}
3073
3074unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3075 switch (Cond) {
3076 case SIInstrInfo::SCC_TRUE:
3077 return AMDGPU::S_CBRANCH_SCC1;
3078 case SIInstrInfo::SCC_FALSE:
3079 return AMDGPU::S_CBRANCH_SCC0;
3080 case SIInstrInfo::VCCNZ:
3081 return AMDGPU::S_CBRANCH_VCCNZ;
3082 case SIInstrInfo::VCCZ:
3083 return AMDGPU::S_CBRANCH_VCCZ;
3084 case SIInstrInfo::EXECNZ:
3085 return AMDGPU::S_CBRANCH_EXECNZ;
3086 case SIInstrInfo::EXECZ:
3087 return AMDGPU::S_CBRANCH_EXECZ;
3088 default:
3089 llvm_unreachable("invalid branch predicate");
3090 }
3091}
3092
3093SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3094 switch (Opcode) {
3095 case AMDGPU::S_CBRANCH_SCC0:
3096 return SCC_FALSE;
3097 case AMDGPU::S_CBRANCH_SCC1:
3098 return SCC_TRUE;
3099 case AMDGPU::S_CBRANCH_VCCNZ:
3100 return VCCNZ;
3101 case AMDGPU::S_CBRANCH_VCCZ:
3102 return VCCZ;
3103 case AMDGPU::S_CBRANCH_EXECNZ:
3104 return EXECNZ;
3105 case AMDGPU::S_CBRANCH_EXECZ:
3106 return EXECZ;
3107 default:
3108 return INVALID_BR;
3109 }
3110}
3111
3115 MachineBasicBlock *&FBB,
3117 bool AllowModify) const {
3118 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3119 // Unconditional Branch
3120 TBB = I->getOperand(0).getMBB();
3121 return false;
3122 }
3123
3124 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3125 if (Pred == INVALID_BR)
3126 return true;
3127
3128 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3129 Cond.push_back(MachineOperand::CreateImm(Pred));
3130 Cond.push_back(I->getOperand(1)); // Save the branch register.
3131
3132 ++I;
3133
3134 if (I == MBB.end()) {
3135 // Conditional branch followed by fall-through.
3136 TBB = CondBB;
3137 return false;
3138 }
3139
3140 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3141 TBB = CondBB;
3142 FBB = I->getOperand(0).getMBB();
3143 return false;
3144 }
3145
3146 return true;
3147}
3148
3150 MachineBasicBlock *&FBB,
3152 bool AllowModify) const {
3153 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3154 auto E = MBB.end();
3155 if (I == E)
3156 return false;
3157
3158 // Skip over the instructions that are artificial terminators for special
3159 // exec management.
3160 while (I != E && !I->isBranch() && !I->isReturn()) {
3161 switch (I->getOpcode()) {
3162 case AMDGPU::S_MOV_B64_term:
3163 case AMDGPU::S_XOR_B64_term:
3164 case AMDGPU::S_OR_B64_term:
3165 case AMDGPU::S_ANDN2_B64_term:
3166 case AMDGPU::S_AND_B64_term:
3167 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3168 case AMDGPU::S_MOV_B32_term:
3169 case AMDGPU::S_XOR_B32_term:
3170 case AMDGPU::S_OR_B32_term:
3171 case AMDGPU::S_ANDN2_B32_term:
3172 case AMDGPU::S_AND_B32_term:
3173 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3174 break;
3175 case AMDGPU::SI_IF:
3176 case AMDGPU::SI_ELSE:
3177 case AMDGPU::SI_KILL_I1_TERMINATOR:
3178 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3179 // FIXME: It's messy that these need to be considered here at all.
3180 return true;
3181 default:
3182 llvm_unreachable("unexpected non-branch terminator inst");
3183 }
3184
3185 ++I;
3186 }
3187
3188 if (I == E)
3189 return false;
3190
3191 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3192}
3193
3195 int *BytesRemoved) const {
3196 unsigned Count = 0;
3197 unsigned RemovedSize = 0;
3198 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3199 // Skip over artificial terminators when removing instructions.
3200 if (MI.isBranch() || MI.isReturn()) {
3201 RemovedSize += getInstSizeInBytes(MI);
3202 MI.eraseFromParent();
3203 ++Count;
3204 }
3205 }
3206
3207 if (BytesRemoved)
3208 *BytesRemoved = RemovedSize;
3209
3210 return Count;
3211}
3212
3213// Copy the flags onto the implicit condition register operand.
3215 const MachineOperand &OrigCond) {
3216 CondReg.setIsUndef(OrigCond.isUndef());
3217 CondReg.setIsKill(OrigCond.isKill());
3218}
3219
3222 MachineBasicBlock *FBB,
3224 const DebugLoc &DL,
3225 int *BytesAdded) const {
3226 if (!FBB && Cond.empty()) {
3227 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3228 .addMBB(TBB);
3229 if (BytesAdded)
3230 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3231 return 1;
3232 }
3233
3234 assert(TBB && Cond[0].isImm());
3235
3236 unsigned Opcode
3237 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3238
3239 if (!FBB) {
3240 MachineInstr *CondBr =
3241 BuildMI(&MBB, DL, get(Opcode))
3242 .addMBB(TBB);
3243
3244 // Copy the flags onto the implicit condition register operand.
3245 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3246 fixImplicitOperands(*CondBr);
3247
3248 if (BytesAdded)
3249 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3250 return 1;
3251 }
3252
3253 assert(TBB && FBB);
3254
3255 MachineInstr *CondBr =
3256 BuildMI(&MBB, DL, get(Opcode))
3257 .addMBB(TBB);
3258 fixImplicitOperands(*CondBr);
3259 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3260 .addMBB(FBB);
3261
3262 MachineOperand &CondReg = CondBr->getOperand(1);
3263 CondReg.setIsUndef(Cond[1].isUndef());
3264 CondReg.setIsKill(Cond[1].isKill());
3265
3266 if (BytesAdded)
3267 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3268
3269 return 2;
3270}
3271
3274 if (Cond.size() != 2) {
3275 return true;
3276 }
3277
3278 if (Cond[0].isImm()) {
3279 Cond[0].setImm(-Cond[0].getImm());
3280 return false;
3281 }
3282
3283 return true;
3284}
3285
3288 Register DstReg, Register TrueReg,
3289 Register FalseReg, int &CondCycles,
3290 int &TrueCycles, int &FalseCycles) const {
3291 switch (Cond[0].getImm()) {
3292 case VCCNZ:
3293 case VCCZ: {
3294 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3295 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3296 if (MRI.getRegClass(FalseReg) != RC)
3297 return false;
3298
3299 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3300 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3301
3302 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3303 return RI.hasVGPRs(RC) && NumInsts <= 6;
3304 }
3305 case SCC_TRUE:
3306 case SCC_FALSE: {
3307 // FIXME: We could insert for VGPRs if we could replace the original compare
3308 // with a vector one.
3309 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3310 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3311 if (MRI.getRegClass(FalseReg) != RC)
3312 return false;
3313
3314 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3315
3316 // Sizes that are a multiple of 8 bytes (64 bits) can use s_cselect_b64.
3317 if (NumInsts % 2 == 0)
3318 NumInsts /= 2;
3319
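    // e.g. a 128-bit SGPR select is costed as two s_cselect_b64 rather than
    // four s_cselect_b32.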
3320 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3321 return RI.isSGPRClass(RC);
3322 }
3323 default:
3324 return false;
3325 }
3326}
3327
3331 Register TrueReg, Register FalseReg) const {
3332 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3333 if (Pred == VCCZ || Pred == SCC_FALSE) {
3334 Pred = static_cast<BranchPredicate>(-Pred);
3335 std::swap(TrueReg, FalseReg);
3336 }
3337
3338 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3339 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3340 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3341
3342 if (DstSize == 32) {
3344 if (Pred == SCC_TRUE) {
3345 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3346 .addReg(TrueReg)
3347 .addReg(FalseReg);
3348 } else {
3349 // Instruction's operands are backwards from what is expected.
3350 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3351 .addReg(FalseReg)
3352 .addReg(TrueReg);
3353 }
3354
3355 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3356 return;
3357 }
3358
3359 if (DstSize == 64 && Pred == SCC_TRUE) {
3361 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3362 .addReg(TrueReg)
3363 .addReg(FalseReg);
3364
3365 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3366 return;
3367 }
3368
3369 static const int16_t Sub0_15[] = {
3370 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3371 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3372 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3373 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3374 };
3375
3376 static const int16_t Sub0_15_64[] = {
3377 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3378 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3379 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3380 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3381 };
3382
3383 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3384 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3385 const int16_t *SubIndices = Sub0_15;
3386 int NElts = DstSize / 32;
3387
3388 // 64-bit select is only available for SALU.
3389 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3390 if (Pred == SCC_TRUE) {
3391 if (NElts % 2) {
3392 SelOp = AMDGPU::S_CSELECT_B32;
3393 EltRC = &AMDGPU::SGPR_32RegClass;
3394 } else {
3395 SelOp = AMDGPU::S_CSELECT_B64;
3396 EltRC = &AMDGPU::SGPR_64RegClass;
3397 SubIndices = Sub0_15_64;
3398 NElts /= 2;
3399 }
3400 }
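  // e.g. a 128-bit VGPR select becomes four v_cndmask_b32 on sub0..sub3,
  // reassembled by the REG_SEQUENCE built below (illustrative).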
3401
3403 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3404
3405 I = MIB->getIterator();
3406
3408 for (int Idx = 0; Idx != NElts; ++Idx) {
3409 Register DstElt = MRI.createVirtualRegister(EltRC);
3410 Regs.push_back(DstElt);
3411
3412 unsigned SubIdx = SubIndices[Idx];
3413
3415 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3416 Select =
3417 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3418 .addReg(FalseReg, 0, SubIdx)
3419 .addReg(TrueReg, 0, SubIdx);
3420 } else {
3421 Select =
3422 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3423 .addReg(TrueReg, 0, SubIdx)
3424 .addReg(FalseReg, 0, SubIdx);
3425 }
3426
3427 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3429
3430 MIB.addReg(DstElt)
3431 .addImm(SubIdx);
3432 }
3433}
3434
3436 switch (MI.getOpcode()) {
3437 case AMDGPU::V_MOV_B16_t16_e32:
3438 case AMDGPU::V_MOV_B16_t16_e64:
3439 case AMDGPU::V_MOV_B32_e32:
3440 case AMDGPU::V_MOV_B32_e64:
3441 case AMDGPU::V_MOV_B64_PSEUDO:
3442 case AMDGPU::V_MOV_B64_e32:
3443 case AMDGPU::V_MOV_B64_e64:
3444 case AMDGPU::S_MOV_B32:
3445 case AMDGPU::S_MOV_B64:
3446 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3447 case AMDGPU::COPY:
3448 case AMDGPU::WWM_COPY:
3449 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3450 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3451 case AMDGPU::V_ACCVGPR_MOV_B32:
3452 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3453 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3454 return true;
3455 default:
3456 return false;
3457 }
3458}
3459
3461 switch (MI.getOpcode()) {
3462 case AMDGPU::V_MOV_B16_t16_e32:
3463 case AMDGPU::V_MOV_B16_t16_e64:
3464 return 2;
3465 case AMDGPU::V_MOV_B32_e32:
3466 case AMDGPU::V_MOV_B32_e64:
3467 case AMDGPU::V_MOV_B64_PSEUDO:
3468 case AMDGPU::V_MOV_B64_e32:
3469 case AMDGPU::V_MOV_B64_e64:
3470 case AMDGPU::S_MOV_B32:
3471 case AMDGPU::S_MOV_B64:
3472 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3473 case AMDGPU::COPY:
3474 case AMDGPU::WWM_COPY:
3475 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3476 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3477 case AMDGPU::V_ACCVGPR_MOV_B32:
3478 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3479 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3480 return 1;
3481 default:
3482 llvm_unreachable("MI is not a foldable copy");
3483 }
3484}
3485
3486static constexpr AMDGPU::OpName ModifierOpNames[] = {
3487 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3488 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3489 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3490
3491void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3492 unsigned Opc = MI.getOpcode();
3493 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3494 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3495 if (Idx >= 0)
3496 MI.removeOperand(Idx);
3497 }
3498}
3499
3501 const MCInstrDesc &NewDesc) const {
3502 MI.setDesc(NewDesc);
3503
3504 // Remove any leftover implicit operands from mutating the instruction. e.g.
3505 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3506 // anymore.
3507 const MCInstrDesc &Desc = MI.getDesc();
3508 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3509 Desc.implicit_defs().size();
3510
3511 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3512 MI.removeOperand(I);
3513}
3514
3515std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3516 unsigned SubRegIndex) {
3517 switch (SubRegIndex) {
3518 case AMDGPU::NoSubRegister:
3519 return Imm;
3520 case AMDGPU::sub0:
3521 return SignExtend64<32>(Imm);
3522 case AMDGPU::sub1:
3523 return SignExtend64<32>(Imm >> 32);
3524 case AMDGPU::lo16:
3525 return SignExtend64<16>(Imm);
3526 case AMDGPU::hi16:
3527 return SignExtend64<16>(Imm >> 16);
3528 case AMDGPU::sub1_lo16:
3529 return SignExtend64<16>(Imm >> 32);
3530 case AMDGPU::sub1_hi16:
3531 return SignExtend64<16>(Imm >> 48);
3532 default:
3533 return std::nullopt;
3534 }
3535
3536 llvm_unreachable("covered subregister switch");
3537}
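// For illustration, the values below follow directly from the switch above:
// with Imm = 0x123456789ABCDEF0,
//   sub0      -> SignExtend64<32>(0x9ABCDEF0) = 0xFFFFFFFF9ABCDEF0
//   sub1      -> 0x0000000012345678
//   lo16      -> SignExtend64<16>(0xDEF0)     = 0xFFFFFFFFFFFFDEF0
//   hi16      -> SignExtend64<16>(0x9ABC)     = 0xFFFFFFFFFFFF9ABC
//   sub1_lo16 -> 0x0000000000005678
//   sub1_hi16 -> 0x0000000000001234
// and any other subregister index yields std::nullopt.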
3538
3539static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3540 switch (Opc) {
3541 case AMDGPU::V_MAC_F16_e32:
3542 case AMDGPU::V_MAC_F16_e64:
3543 case AMDGPU::V_MAD_F16_e64:
3544 return AMDGPU::V_MADAK_F16;
3545 case AMDGPU::V_MAC_F32_e32:
3546 case AMDGPU::V_MAC_F32_e64:
3547 case AMDGPU::V_MAD_F32_e64:
3548 return AMDGPU::V_MADAK_F32;
3549 case AMDGPU::V_FMAC_F32_e32:
3550 case AMDGPU::V_FMAC_F32_e64:
3551 case AMDGPU::V_FMA_F32_e64:
3552 return AMDGPU::V_FMAAK_F32;
3553 case AMDGPU::V_FMAC_F16_e32:
3554 case AMDGPU::V_FMAC_F16_e64:
3555 case AMDGPU::V_FMAC_F16_t16_e64:
3556 case AMDGPU::V_FMAC_F16_fake16_e64:
3557 case AMDGPU::V_FMAC_F16_t16_e32:
3558 case AMDGPU::V_FMAC_F16_fake16_e32:
3559 case AMDGPU::V_FMA_F16_e64:
3560 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3561 ? AMDGPU::V_FMAAK_F16_t16
3562 : AMDGPU::V_FMAAK_F16_fake16
3563 : AMDGPU::V_FMAAK_F16;
3564 case AMDGPU::V_FMAC_F64_e32:
3565 case AMDGPU::V_FMAC_F64_e64:
3566 case AMDGPU::V_FMA_F64_e64:
3567 return AMDGPU::V_FMAAK_F64;
3568 default:
3569 llvm_unreachable("invalid instruction");
3570 }
3571}
3572
3573static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3574 switch (Opc) {
3575 case AMDGPU::V_MAC_F16_e32:
3576 case AMDGPU::V_MAC_F16_e64:
3577 case AMDGPU::V_MAD_F16_e64:
3578 return AMDGPU::V_MADMK_F16;
3579 case AMDGPU::V_MAC_F32_e32:
3580 case AMDGPU::V_MAC_F32_e64:
3581 case AMDGPU::V_MAD_F32_e64:
3582 return AMDGPU::V_MADMK_F32;
3583 case AMDGPU::V_FMAC_F32_e32:
3584 case AMDGPU::V_FMAC_F32_e64:
3585 case AMDGPU::V_FMA_F32_e64:
3586 return AMDGPU::V_FMAMK_F32;
3587 case AMDGPU::V_FMAC_F16_e32:
3588 case AMDGPU::V_FMAC_F16_e64:
3589 case AMDGPU::V_FMAC_F16_t16_e64:
3590 case AMDGPU::V_FMAC_F16_fake16_e64:
3591 case AMDGPU::V_FMAC_F16_t16_e32:
3592 case AMDGPU::V_FMAC_F16_fake16_e32:
3593 case AMDGPU::V_FMA_F16_e64:
3594 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3595 ? AMDGPU::V_FMAMK_F16_t16
3596 : AMDGPU::V_FMAMK_F16_fake16
3597 : AMDGPU::V_FMAMK_F16;
3598 case AMDGPU::V_FMAC_F64_e32:
3599 case AMDGPU::V_FMAC_F64_e64:
3600 case AMDGPU::V_FMA_F64_e64:
3601 return AMDGPU::V_FMAMK_F64;
3602 default:
3603 llvm_unreachable("invalid instruction");
3604 }
3605}
3606
3607bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3608 Register Reg, MachineRegisterInfo *MRI) const {
3609 int64_t Imm;
3610 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3611 return false;
3612
3613 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3614
3615 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3616
3617 unsigned Opc = UseMI.getOpcode();
3618 if (Opc == AMDGPU::COPY) {
3619 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3620
3621 Register DstReg = UseMI.getOperand(0).getReg();
3622 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3623
3624 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3625
3626 if (HasMultipleUses) {
3627 // TODO: This should fold in more cases with multiple use, but we need to
3628 // more carefully consider what those uses are.
3629 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3630
3631 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3632 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3633 return false;
3634
3635 // Most of the time folding a 32-bit inline constant is free (though this
3636 // might not be true if we can't later fold it into a real user).
3637 //
3638 // FIXME: This isInlineConstant check is imprecise if
3639 // getConstValDefinedInReg handled the tricky non-mov cases.
3640 if (ImmDefSize == 32 &&
3641 isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32))
3642 return false;
3643 }
3644
3645 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3646 RI.getSubRegIdxSize(UseSubReg) == 16;
3647
3648 if (Is16Bit) {
3649 if (RI.hasVGPRs(DstRC))
3650 return false; // Do not clobber vgpr_hi16
3651
3652 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3653 return false;
3654 }
3655
3656 MachineFunction *MF = UseMI.getMF();
3657
3658 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3659 MCRegister MovDstPhysReg =
3660 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3661
3662 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3663
3664 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3665 for (unsigned MovOp :
3666 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3667 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3668 const MCInstrDesc &MovDesc = get(MovOp);
3669
3670 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3671 if (Is16Bit) {
3672 // We just need to find a correctly sized register class, so the
3673 // subregister index compatibility doesn't matter since we're statically
3674 // extracting the immediate value.
3675 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3676 if (!MovDstRC)
3677 continue;
3678
3679 if (MovDstPhysReg) {
3680 // FIXME: We probably should not do this. If there is a live value in
3681 // the high half of the register, it will be corrupted.
3682 MovDstPhysReg =
3683 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3684 if (!MovDstPhysReg)
3685 continue;
3686 }
3687 }
3688
3689 // Result class isn't the right size, try the next instruction.
3690 if (MovDstPhysReg) {
3691 if (!MovDstRC->contains(MovDstPhysReg))
3692 return false;
3693 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3694 // TODO: This will be overly conservative in the case of 16-bit virtual
3695 // SGPRs. We could hack up the virtual register uses to use a compatible
3696 // 32-bit class.
3697 continue;
3698 }
3699
3700 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3701
3702 // Ensure the interpreted immediate value is a valid operand in the new
3703 // mov.
3704 //
3705 // FIXME: isImmOperandLegal should have form that doesn't require existing
3706 // MachineInstr or MachineOperand
3707 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3708 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3709 break;
3710
3711 NewOpc = MovOp;
3712 break;
3713 }
3714
3715 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3716 return false;
3717
3718 if (Is16Bit) {
3719 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3720 if (MovDstPhysReg)
3721 UseMI.getOperand(0).setReg(MovDstPhysReg);
3722 assert(UseMI.getOperand(1).getReg().isVirtual());
3723 }
3724
3725 const MCInstrDesc &NewMCID = get(NewOpc);
3726 UseMI.setDesc(NewMCID);
3727 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3728 UseMI.addImplicitDefUseOperands(*MF);
3729 return true;
3730 }
3731
3732 if (HasMultipleUses)
3733 return false;
3734
3735 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3736 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3737 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3738 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3739 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3740 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3741 Opc == AMDGPU::V_FMAC_F64_e64) {
3742 // Don't fold if we are using source or output modifiers. The new VOP2
3743 // instructions don't have them.
3744 if (hasAnyModifiersSet(UseMI))
3745 return false;
3746
3747 // If this is a free constant, there's no reason to do this.
3748 // TODO: We could fold this here instead of letting SIFoldOperands do it
3749 // later.
3750 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3751
3752 // Any src operand can be used for the legality check.
3753 if (isInlineConstant(UseMI, Src0Idx, Imm))
3754 return false;
3755
3756 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3757
3758 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3759 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3760
3761 auto CopyRegOperandToNarrowerRC =
3762 [MRI, this](MachineInstr &MI, unsigned OpNo,
3763 const TargetRegisterClass *NewRC) -> void {
3764 if (!MI.getOperand(OpNo).isReg())
3765 return;
3766 Register Reg = MI.getOperand(OpNo).getReg();
3767 const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg);
3768 if (RI.getCommonSubClass(RC, NewRC) != NewRC)
3769 return;
3770 Register Tmp = MRI->createVirtualRegister(NewRC);
3771 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
3772 get(AMDGPU::COPY), Tmp)
3773 .addReg(Reg);
3774 MI.getOperand(OpNo).setReg(Tmp);
3775 MI.getOperand(OpNo).setIsKill();
3776 };
3777
3778 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3779 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3780 (Src1->isReg() && Src1->getReg() == Reg)) {
3781 MachineOperand *RegSrc =
3782 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3783 if (!RegSrc->isReg())
3784 return false;
3785 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3786 ST.getConstantBusLimit(Opc) < 2)
3787 return false;
3788
3789 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3790 return false;
3791
3792 // If src2 is also a literal constant then we have to choose which one to
3793 // fold. In general it is better to choose madak so that the other literal
3794 // can be materialized in an sgpr instead of a vgpr:
3795 // s_mov_b32 s0, literal
3796 // v_madak_f32 v0, s0, v0, literal
3797 // Instead of:
3798 // v_mov_b32 v1, literal
3799 // v_madmk_f32 v0, v0, literal, v1
3800 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3801 if (Def && Def->isMoveImmediate() &&
3802 !isInlineConstant(Def->getOperand(1)))
3803 return false;
3804
3805 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3806 if (pseudoToMCOpcode(NewOpc) == -1)
3807 return false;
3808
3809 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3810 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3811
3812 // FIXME: This would be a lot easier if we could return a new instruction
3813 // instead of having to modify in place.
3814
3815 Register SrcReg = RegSrc->getReg();
3816 unsigned SrcSubReg = RegSrc->getSubReg();
3817 Src0->setReg(SrcReg);
3818 Src0->setSubReg(SrcSubReg);
3819 Src0->setIsKill(RegSrc->isKill());
3820
3821 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3822 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3823 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3824 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3825 UseMI.untieRegOperand(
3826 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3827
3828 Src1->ChangeToImmediate(*SubRegImm);
3829
3830 removeModOperands(UseMI);
3831 UseMI.setDesc(get(NewOpc));
3832
3833 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3834 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3835 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3836 Register Tmp = MRI->createVirtualRegister(NewRC);
3837 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3838 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3839 UseMI.getOperand(0).getReg())
3840 .addReg(Tmp, RegState::Kill);
3841 UseMI.getOperand(0).setReg(Tmp);
3842 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3843 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3844 }
3845
3846 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3847 if (DeleteDef)
3848 DefMI.eraseFromParent();
3849
3850 return true;
3851 }
3852
3853 // Added part is the constant: Use v_madak_{f16, f32}.
3854 if (Src2->isReg() && Src2->getReg() == Reg) {
3855 if (ST.getConstantBusLimit(Opc) < 2) {
3856 // Not allowed to use constant bus for another operand.
3857 // We can however allow an inline immediate as src0.
3858 bool Src0Inlined = false;
3859 if (Src0->isReg()) {
3860 // Try to inline constant if possible.
3861 // If the Def is a move-immediate with a single use,
3862 // we are saving a VGPR here.
3863 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3864 if (Def && Def->isMoveImmediate() &&
3865 isInlineConstant(Def->getOperand(1)) &&
3866 MRI->hasOneNonDBGUse(Src0->getReg())) {
3867 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3868 Src0Inlined = true;
3869 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3870 RI.isSGPRReg(*MRI, Src0->getReg())) {
3871 return false;
3872 }
3873 // VGPR is okay as Src0 - fallthrough
3874 }
3875
3876 if (Src1->isReg() && !Src0Inlined) {
3877 // We have one slot for inlinable constant so far - try to fill it
3878 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3879 if (Def && Def->isMoveImmediate() &&
3880 isInlineConstant(Def->getOperand(1)) &&
3881 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3882 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3883 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3884 return false;
3885 // VGPR is okay as Src1 - fallthrough
3886 }
3887 }
3888
3889 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3890 if (pseudoToMCOpcode(NewOpc) == -1)
3891 return false;
3892
3893 // FIXME: This would be a lot easier if we could return a new instruction
3894 // instead of having to modify in place.
3895
3896 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3897 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3898 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3899 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3900 UseMI.untieRegOperand(
3901 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3902
3903 const std::optional<int64_t> SubRegImm =
3904 extractSubregFromImm(Imm, Src2->getSubReg());
3905
3906 // ChangingToImmediate adds Src2 back to the instruction.
3907 Src2->ChangeToImmediate(*SubRegImm);
3908
3909 // These come before src2.
3910 removeModOperands(UseMI);
3911 UseMI.setDesc(get(NewOpc));
3912
3913 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3914 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
3915 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3916 Register Tmp = MRI->createVirtualRegister(NewRC);
3917 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3918 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3919 UseMI.getOperand(0).getReg())
3920 .addReg(Tmp, RegState::Kill);
3921 UseMI.getOperand(0).setReg(Tmp);
3922 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3923 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
3924 }
3925
3926 // It might happen that UseMI was commuted and we now have an SGPR as
3927 // SRC1. If so, using both an inline constant and an SGPR would be
3928 // illegal, so legalize the operands.
3929 legalizeOperands(UseMI);
3930
3931 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3932 if (DeleteDef)
3933 DefMI.eraseFromParent();
3934
3935 return true;
3936 }
3937 }
3938
3939 return false;
3940}
3941
3942static bool
3943memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3944 ArrayRef<const MachineOperand *> BaseOps2) {
3945 if (BaseOps1.size() != BaseOps2.size())
3946 return false;
3947 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3948 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3949 return false;
3950 }
3951 return true;
3952}
3953
3954static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3955 LocationSize WidthB, int OffsetB) {
3956 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3957 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3958 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3959 return LowWidth.hasValue() &&
3960 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3961}
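// For illustration: accesses with (Offset, Width) = (0, 4) and (4, 8) satisfy
// 0 + 4 <= 4 and are reported as disjoint, while (0, 4) and (2, 8) give
// 0 + 4 > 2 and are conservatively treated as possibly overlapping. An
// unknown width on the lower access likewise falls back to "may overlap".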
3962
3963bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3964 const MachineInstr &MIb) const {
3965 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3966 int64_t Offset0, Offset1;
3967 LocationSize Dummy0 = LocationSize::precise(0);
3968 LocationSize Dummy1 = LocationSize::precise(0);
3969 bool Offset0IsScalable, Offset1IsScalable;
3970 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3971 Dummy0, &RI) ||
3972 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3973 Dummy1, &RI))
3974 return false;
3975
3976 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3977 return false;
3978
3979 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3980 // FIXME: Handle ds_read2 / ds_write2.
3981 return false;
3982 }
3983 LocationSize Width0 = MIa.memoperands().front()->getSize();
3984 LocationSize Width1 = MIb.memoperands().front()->getSize();
3985 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3986}
3987
3988bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3989 const MachineInstr &MIb) const {
3990 assert(MIa.mayLoadOrStore() &&
3991 "MIa must load from or modify a memory location");
3992 assert(MIb.mayLoadOrStore() &&
3993 "MIb must load from or modify a memory location");
3994
3995 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3996 return false;
3997
3998 // XXX - Can we relax this between address spaces?
3999 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
4000 return false;
4001
4002 if (isLDSDMA(MIa) || isLDSDMA(MIb))
4003 return false;
4004
4005 if (MIa.isBundle() || MIb.isBundle())
4006 return false;
4007
4008 // TODO: Should we check the address space from the MachineMemOperand? That
4009 // would allow us to distinguish objects we know don't alias based on the
4010 // underlying address space, even if it was lowered to a different one,
4011 // e.g. private accesses lowered to use MUBUF instructions on a scratch
4012 // buffer.
4013 if (isDS(MIa)) {
4014 if (isDS(MIb))
4015 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4016
4017 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
4018 }
4019
4020 if (isMUBUF(MIa) || isMTBUF(MIa)) {
4021 if (isMUBUF(MIb) || isMTBUF(MIb))
4022 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4023
4024 if (isFLAT(MIb))
4025 return isFLATScratch(MIb);
4026
4027 return !isSMRD(MIb);
4028 }
4029
4030 if (isSMRD(MIa)) {
4031 if (isSMRD(MIb))
4032 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4033
4034 if (isFLAT(MIb))
4035 return isFLATScratch(MIb);
4036
4037 return !isMUBUF(MIb) && !isMTBUF(MIb);
4038 }
4039
4040 if (isFLAT(MIa)) {
4041 if (isFLAT(MIb)) {
4042 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
4043 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
4044 return true;
4045
4046 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4047 }
4048
4049 return false;
4050 }
4051
4052 return false;
4053}
4054
4055static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
4056 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4057 if (Reg.isPhysical())
4058 return false;
4059 auto *Def = MRI.getUniqueVRegDef(Reg);
4060 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
4061 Imm = Def->getOperand(1).getImm();
4062 if (DefMI)
4063 *DefMI = Def;
4064 return true;
4065 }
4066 return false;
4067}
4068
4069static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4070 MachineInstr **DefMI = nullptr) {
4071 if (!MO->isReg())
4072 return false;
4073 const MachineFunction *MF = MO->getParent()->getMF();
4074 const MachineRegisterInfo &MRI = MF->getRegInfo();
4075 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4076}
4077
4078static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
4079 MachineInstr &NewMI) {
4080 if (LV) {
4081 unsigned NumOps = MI.getNumOperands();
4082 for (unsigned I = 1; I < NumOps; ++I) {
4083 MachineOperand &Op = MI.getOperand(I);
4084 if (Op.isReg() && Op.isKill())
4085 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4086 }
4087 }
4088}
4089
4090static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4091 switch (Opc) {
4092 case AMDGPU::V_MAC_F16_e32:
4093 case AMDGPU::V_MAC_F16_e64:
4094 return AMDGPU::V_MAD_F16_e64;
4095 case AMDGPU::V_MAC_F32_e32:
4096 case AMDGPU::V_MAC_F32_e64:
4097 return AMDGPU::V_MAD_F32_e64;
4098 case AMDGPU::V_MAC_LEGACY_F32_e32:
4099 case AMDGPU::V_MAC_LEGACY_F32_e64:
4100 return AMDGPU::V_MAD_LEGACY_F32_e64;
4101 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4102 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4103 return AMDGPU::V_FMA_LEGACY_F32_e64;
4104 case AMDGPU::V_FMAC_F16_e32:
4105 case AMDGPU::V_FMAC_F16_e64:
4106 case AMDGPU::V_FMAC_F16_t16_e64:
4107 case AMDGPU::V_FMAC_F16_fake16_e64:
4108 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4109 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4110 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4111 : AMDGPU::V_FMA_F16_gfx9_e64;
4112 case AMDGPU::V_FMAC_F32_e32:
4113 case AMDGPU::V_FMAC_F32_e64:
4114 return AMDGPU::V_FMA_F32_e64;
4115 case AMDGPU::V_FMAC_F64_e32:
4116 case AMDGPU::V_FMAC_F64_e64:
4117 return AMDGPU::V_FMA_F64_e64;
4118 default:
4119 llvm_unreachable("invalid instruction");
4120 }
4121}
4122
4123/// Helper struct for the implementation of 3-address conversion to communicate
4124/// updates made to instruction operands.
4125struct ThreeAddressUpdates {
4126 /// Other instruction whose def is no longer used by the converted
4127 /// instruction.
4128 MachineInstr *RemoveMIUse = nullptr;
4129};
4130
4131MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4132 LiveVariables *LV,
4133 LiveIntervals *LIS) const {
4134 MachineBasicBlock &MBB = *MI.getParent();
4135 MachineInstr *CandidateMI = &MI;
4136
4137 if (MI.isBundle()) {
4138 // This is a temporary placeholder for bundle handling that enables us to
4139 // exercise the relevant code paths in the two-address instruction pass.
4140 if (MI.getBundleSize() != 1)
4141 return nullptr;
4142 CandidateMI = MI.getNextNode();
4143 }
4144
4146 MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
4147 if (!NewMI)
4148 return nullptr;
4149
4150 if (MI.isBundle()) {
4151 CandidateMI->eraseFromBundle();
4152
4153 for (MachineOperand &MO : MI.all_defs()) {
4154 if (MO.isTied())
4155 MI.untieRegOperand(MO.getOperandNo());
4156 }
4157 } else {
4158 updateLiveVariables(LV, MI, *NewMI);
4159 if (LIS) {
4160 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4161 // SlotIndex of defs needs to be updated when converting to early-clobber
4162 MachineOperand &Def = NewMI->getOperand(0);
4163 if (Def.isEarlyClobber() && Def.isReg() &&
4164 LIS->hasInterval(Def.getReg())) {
4165 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4166 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4167 auto &LI = LIS->getInterval(Def.getReg());
4168 auto UpdateDefIndex = [&](LiveRange &LR) {
4169 auto *S = LR.find(OldIndex);
4170 if (S != LR.end() && S->start == OldIndex) {
4171 assert(S->valno && S->valno->def == OldIndex);
4172 S->start = NewIndex;
4173 S->valno->def = NewIndex;
4174 }
4175 };
4176 UpdateDefIndex(LI);
4177 for (auto &SR : LI.subranges())
4178 UpdateDefIndex(SR);
4179 }
4180 }
4181 }
4182
4183 if (U.RemoveMIUse) {
4184 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4185 // The only user is the instruction which will be killed.
4186 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4187
4188 if (MRI.hasOneNonDBGUse(DefReg)) {
4189 // We cannot just remove the DefMI here; the calling pass will crash.
4190 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4191 U.RemoveMIUse->getOperand(0).setIsDead(true);
4192 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4193 U.RemoveMIUse->removeOperand(I);
4194 if (LV)
4195 LV->getVarInfo(DefReg).AliveBlocks.clear();
4196 }
4197
4198 if (MI.isBundle()) {
4199 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4200 if (!VRI.Reads && !VRI.Writes) {
4201 for (MachineOperand &MO : MI.all_uses()) {
4202 if (MO.isReg() && MO.getReg() == DefReg) {
4203 assert(MO.getSubReg() == 0 &&
4204 "tied sub-registers in bundles currently not supported");
4205 MI.removeOperand(MO.getOperandNo());
4206 break;
4207 }
4208 }
4209
4210 if (LIS)
4211 LIS->shrinkToUses(&LIS->getInterval(DefReg));
4212 }
4213 } else if (LIS) {
4214 LiveInterval &DefLI = LIS->getInterval(DefReg);
4215
4216 // We cannot delete the original instruction here, so hack out the use
4217 // in the original instruction with a dummy register so we can use
4218 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4219 // not have the complexity of deleting a use to consider here.
4220 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4221 for (MachineOperand &MIOp : MI.uses()) {
4222 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4223 MIOp.setIsUndef(true);
4224 MIOp.setReg(DummyReg);
4225 }
4226 }
4227
4228 if (MI.isBundle()) {
4229 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4230 if (!VRI.Reads && !VRI.Writes) {
4231 for (MachineOperand &MIOp : MI.uses()) {
4232 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4233 MIOp.setIsUndef(true);
4234 MIOp.setReg(DummyReg);
4235 }
4236 }
4237 }
4238
4239 MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
4240 false, /*isUndef=*/true));
4241 }
4242
4243 LIS->shrinkToUses(&DefLI);
4244 }
4245 }
4246
4247 return MI.isBundle() ? &MI : NewMI;
4248}
4249
4250MachineInstr *
4251SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4252 ThreeAddressUpdates &U) const {
4253 MachineBasicBlock &MBB = *MI.getParent();
4254 unsigned Opc = MI.getOpcode();
4255
4256 // Handle MFMA.
4257 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4258 if (NewMFMAOpc != -1) {
4259 MachineInstrBuilder MIB =
4260 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4261 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4262 MIB.add(MI.getOperand(I));
4263 return MIB;
4264 }
4265
4266 if (SIInstrInfo::isWMMA(MI)) {
4267 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4268 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4269 .setMIFlags(MI.getFlags());
4270 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4271 MIB->addOperand(MI.getOperand(I));
4272 return MIB;
4273 }
4274
4275 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4276 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4277 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4278 "present pre-RA");
4279
4280 // Handle MAC/FMAC.
4281 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4282 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4283 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4284 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4285 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4286 bool Src0Literal = false;
4287
4288 switch (Opc) {
4289 default:
4290 return nullptr;
4291 case AMDGPU::V_MAC_F16_e64:
4292 case AMDGPU::V_FMAC_F16_e64:
4293 case AMDGPU::V_FMAC_F16_t16_e64:
4294 case AMDGPU::V_FMAC_F16_fake16_e64:
4295 case AMDGPU::V_MAC_F32_e64:
4296 case AMDGPU::V_MAC_LEGACY_F32_e64:
4297 case AMDGPU::V_FMAC_F32_e64:
4298 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4299 case AMDGPU::V_FMAC_F64_e64:
4300 break;
4301 case AMDGPU::V_MAC_F16_e32:
4302 case AMDGPU::V_FMAC_F16_e32:
4303 case AMDGPU::V_MAC_F32_e32:
4304 case AMDGPU::V_MAC_LEGACY_F32_e32:
4305 case AMDGPU::V_FMAC_F32_e32:
4306 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4307 case AMDGPU::V_FMAC_F64_e32: {
4308 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4309 AMDGPU::OpName::src0);
4310 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4311 if (!Src0->isReg() && !Src0->isImm())
4312 return nullptr;
4313
4314 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4315 Src0Literal = true;
4316
4317 break;
4318 }
4319 }
4320
4321 MachineInstrBuilder MIB;
4322 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4323 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4324 const MachineOperand *Src0Mods =
4325 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4326 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4327 const MachineOperand *Src1Mods =
4328 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4329 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4330 const MachineOperand *Src2Mods =
4331 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4332 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4333 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4334 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4335
4336 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4337 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4338 // If we have an SGPR input, we will violate the constant bus restriction.
4339 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4340 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4341 MachineInstr *DefMI;
4342
4343 int64_t Imm;
4344 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4345 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4346 if (pseudoToMCOpcode(NewOpc) != -1) {
4347 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4348 .add(*Dst)
4349 .add(*Src0)
4350 .add(*Src1)
4351 .addImm(Imm)
4352 .setMIFlags(MI.getFlags());
4353 U.RemoveMIUse = DefMI;
4354 return MIB;
4355 }
4356 }
4357 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4358 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4359 if (pseudoToMCOpcode(NewOpc) != -1) {
4360 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4361 .add(*Dst)
4362 .add(*Src0)
4363 .addImm(Imm)
4364 .add(*Src2)
4365 .setMIFlags(MI.getFlags());
4366 U.RemoveMIUse = DefMI;
4367 return MIB;
4368 }
4369 }
4370 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4371 if (Src0Literal) {
4372 Imm = Src0->getImm();
4373 DefMI = nullptr;
4374 }
4375 if (pseudoToMCOpcode(NewOpc) != -1 &&
4376 isOperandLegal(
4377 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4378 Src1)) {
4379 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4380 .add(*Dst)
4381 .add(*Src1)
4382 .addImm(Imm)
4383 .add(*Src2)
4384 .setMIFlags(MI.getFlags());
4385 U.RemoveMIUse = DefMI;
4386 return MIB;
4387 }
4388 }
4389 }
4390
4391 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4392 // if VOP3 does not allow a literal operand.
4393 if (Src0Literal && !ST.hasVOP3Literal())
4394 return nullptr;
4395
4396 unsigned NewOpc = getNewFMAInst(ST, Opc);
4397
4398 if (pseudoToMCOpcode(NewOpc) == -1)
4399 return nullptr;
4400
4401 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4402 .add(*Dst)
4403 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4404 .add(*Src0)
4405 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4406 .add(*Src1)
4407 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4408 .add(*Src2)
4409 .addImm(Clamp ? Clamp->getImm() : 0)
4410 .addImm(Omod ? Omod->getImm() : 0)
4411 .setMIFlags(MI.getFlags());
4412 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4413 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4414 return MIB;
4415}
4416
4417// It's not generally safe to move VALU instructions across these since it will
4418// start using the register as a base index rather than directly.
4419// XXX - Why isn't hasSideEffects sufficient for these?
4420static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4421 switch (MI.getOpcode()) {
4422 case AMDGPU::S_SET_GPR_IDX_ON:
4423 case AMDGPU::S_SET_GPR_IDX_MODE:
4424 case AMDGPU::S_SET_GPR_IDX_OFF:
4425 return true;
4426 default:
4427 return false;
4428 }
4429}
4430
4431bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4432 const MachineBasicBlock *MBB,
4433 const MachineFunction &MF) const {
4434 // Skipping the check for SP writes in the base implementation. The reason it
4435 // was added was apparently due to compile time concerns.
4436 //
4437 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4438 // but is probably avoidable.
4439
4440 // Copied from base implementation.
4441 // Terminators and labels can't be scheduled around.
4442 if (MI.isTerminator() || MI.isPosition())
4443 return true;
4444
4445 // INLINEASM_BR can jump to another block
4446 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4447 return true;
4448
4449 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4450 return true;
4451
4452 // Target-independent instructions do not have an implicit-use of EXEC, even
4453 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4454 // boundaries prevents incorrect movements of such instructions.
4455 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4456 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4457 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4458 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4459 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4460 changesVGPRIndexingMode(MI);
4461}
4462
4463bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4464 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4465 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4466 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4467}
4468
4470 // Instructions that access scratch use FLAT encoding or BUF encodings.
4471 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4472 return false;
4473
4474 // If scratch is not initialized, we can never access it.
4475 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4476 return false;
4477
4478 // SCRATCH instructions always access scratch.
4479 if (isFLATScratch(MI))
4480 return true;
4481
4482 // If there are no memory operands then conservatively assume the flat
4483 // operation may access scratch.
4484 if (MI.memoperands_empty())
4485 return true;
4486
4487 // See if any memory operand specifies an address space that involves scratch.
4488 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4489 unsigned AS = Memop->getAddrSpace();
4490 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4491 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4492 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4493 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4494 }
4495 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4496 });
4497}
4498
4499bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4500 assert(isFLAT(MI));
4501
4502 // All flat instructions use the VMEM counter except prefetch.
4503 if (!usesVM_CNT(MI))
4504 return false;
4505
4506 // If there are no memory operands then conservatively assume the flat
4507 // operation may access VMEM.
4508 if (MI.memoperands_empty())
4509 return true;
4510
4511 // See if any memory operand specifies an address space that involves VMEM.
4512 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
4513 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4514 // (GDS) address space is not supported by flat operations. Therefore, simply
4515 // return true unless only the LDS address space is found.
4516 for (const MachineMemOperand *Memop : MI.memoperands()) {
4517 unsigned AS = Memop->getAddrSpace();
4519 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4520 return true;
4521 }
4522
4523 return false;
4524}
4525
4526bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4527 assert(isFLAT(MI));
4528
4529 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
4530 if (!usesLGKM_CNT(MI))
4531 return false;
4532
4533 // If in tgsplit mode then there can be no use of LDS.
4534 if (ST.isTgSplitEnabled())
4535 return false;
4536
4537 // If there are no memory operands then conservatively assume the flat
4538 // operation may access LDS.
4539 if (MI.memoperands_empty())
4540 return true;
4541
4542 // See if any memory operand specifies an address space that involves LDS.
4543 for (const MachineMemOperand *Memop : MI.memoperands()) {
4544 unsigned AS = Memop->getAddrSpace();
4545 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4546 return true;
4547 }
4548
4549 return false;
4550}
4551
4552bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4553 // Skip the full operand and register alias search modifiesRegister
4554 // does. There's only a handful of instructions that touch this, it's only an
4555 // implicit def, and doesn't alias any other registers.
4556 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4557}
4558
4560 unsigned Opcode = MI.getOpcode();
4561
4562 if (MI.mayStore() && isSMRD(MI))
4563 return true; // scalar store or atomic
4564
4565 // This will terminate the function when other lanes may need to continue.
4566 if (MI.isReturn())
4567 return true;
4568
4569 // These instructions cause shader I/O that may cause hardware lockups
4570 // when executed with an empty EXEC mask.
4571 //
4572 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4573 // EXEC = 0, but checking for that case here seems not worth it
4574 // given the typical code patterns.
4575 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4576 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4577 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4578 return true;
4579
4580 if (MI.isCall() || MI.isInlineAsm())
4581 return true; // conservative assumption
4582
4583 // Assume that barrier interactions are only intended with active lanes.
4584 if (isBarrier(Opcode))
4585 return true;
4586
4587 // A mode change is a scalar operation that influences vector instructions.
4588 if (modifiesModeRegister(MI))
4589 return true;
4590
4591 // These are like SALU instructions in terms of effects, so it's questionable
4592 // whether we should return true for those.
4593 //
4594 // However, executing them with EXEC = 0 causes them to operate on undefined
4595 // data, which we avoid by returning true here.
4596 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4597 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4598 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4599 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4600 return true;
4601
4602 return false;
4603}
4604
4605bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4606 const MachineInstr &MI) const {
4607 if (MI.isMetaInstruction())
4608 return false;
4609
4610 // This won't read exec if this is an SGPR->SGPR copy.
4611 if (MI.isCopyLike()) {
4612 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4613 return true;
4614
4615 // Make sure this isn't copying exec as a normal operand
4616 return MI.readsRegister(AMDGPU::EXEC, &RI);
4617 }
4618
4619 // Make a conservative assumption about the callee.
4620 if (MI.isCall())
4621 return true;
4622
4623 // Be conservative with any unhandled generic opcodes.
4624 if (!isTargetSpecificOpcode(MI.getOpcode()))
4625 return true;
4626
4627 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4628}
4629
4630bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4631 switch (Imm.getBitWidth()) {
4632 case 1: // This likely will be a condition code mask.
4633 return true;
4634
4635 case 32:
4636 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4637 ST.hasInv2PiInlineImm());
4638 case 64:
4639 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4640 ST.hasInv2PiInlineImm());
4641 case 16:
4642 return ST.has16BitInsts() &&
4643 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4644 ST.hasInv2PiInlineImm());
4645 default:
4646 llvm_unreachable("invalid bitwidth");
4647 }
4648}
4649
4650bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4651 APInt IntImm = Imm.bitcastToAPInt();
4652 int64_t IntImmVal = IntImm.getSExtValue();
4653 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4654 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4655 default:
4656 llvm_unreachable("invalid fltSemantics");
4657 case APFloatBase::S_IEEEsingle:
4658 case APFloatBase::S_IEEEdouble:
4659 return isInlineConstant(IntImm);
4660 case APFloatBase::S_BFloat:
4661 return ST.has16BitInsts() &&
4662 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4663 case APFloatBase::S_IEEEhalf:
4664 return ST.has16BitInsts() &&
4665 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4666 }
4667}
4668
4669bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4670 // MachineOperand provides no way to tell the true operand size, since it only
4671 // records a 64-bit value. We need to know the size to determine if a 32-bit
4672 // floating point immediate bit pattern is legal for an integer immediate. It
4673 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4674 switch (OperandType) {
4684 int32_t Trunc = static_cast<int32_t>(Imm);
4685 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4686 }
4692 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4695 // We would expect inline immediates to not be concerned with an integer/fp
4696 // distinction. However, in the case of 16-bit integer operations, the
4697 // "floating point" values appear to not work. It seems read the low 16-bits
4698 // of 32-bit immediates, which happens to always work for the integer
4699 // values.
4700 //
4701 // See llvm bugzilla 46302.
4702 //
4703 // TODO: Theoretically we could use op-sel to use the high bits of the
4704 // 32-bit FP values.
4716 return false;
4719 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4720 // A few special case instructions have 16-bit operands on subtargets
4721 // where 16-bit instructions are not legal.
4722 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4723 // constants in these cases
4724 int16_t Trunc = static_cast<int16_t>(Imm);
4725 return ST.has16BitInsts() &&
4726 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4727 }
4728
4729 return false;
4730 }
4733 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4734 int16_t Trunc = static_cast<int16_t>(Imm);
4735 return ST.has16BitInsts() &&
4736 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4737 }
4738 return false;
4739 }
4743 return false;
4745 return isLegalAV64PseudoImm(Imm);
4748 // Always embedded in the instruction for free.
4749 return true;
4759 // Just ignore anything else.
4760 return true;
4761 default:
4762 llvm_unreachable("invalid operand type");
4763 }
4764}
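// For illustration of the size dependence described above: the bit pattern of
// 1.0f, 0x3F800000, is accepted for the 32-bit operand types because
// isInlinableLiteral32 recognizes it as an inline float, but through a 64-bit
// operand type the same value is just the integer 1065353216, which
// isInlinableLiteral64 rejects.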
4765
4766static bool compareMachineOp(const MachineOperand &Op0,
4767 const MachineOperand &Op1) {
4768 if (Op0.getType() != Op1.getType())
4769 return false;
4770
4771 switch (Op0.getType()) {
4772 case MachineOperand::MO_Register:
4773 return Op0.getReg() == Op1.getReg();
4774 case MachineOperand::MO_Immediate:
4775 return Op0.getImm() == Op1.getImm();
4776 default:
4777 llvm_unreachable("Didn't expect to be comparing these operand types");
4778 }
4779}
4780
4781bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4782 const MCOperandInfo &OpInfo) const {
4783 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4784 return true;
4785
4786 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4787 return false;
4788
4789 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4790 return true;
4791
4792 return ST.hasVOP3Literal();
4793}
4794
4795bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4796 int64_t ImmVal) const {
4797 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4798 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4799 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4800 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4801 AMDGPU::OpName::src2))
4802 return false;
4803 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4804 }
4805
4806 return isLiteralOperandLegal(InstDesc, OpInfo);
4807}
4808
4809bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4810 const MachineOperand &MO) const {
4811 if (MO.isImm())
4812 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4813
4814 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4815 "unexpected imm-like operand kind");
4816 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4817 return isLiteralOperandLegal(InstDesc, OpInfo);
4818}
4819
4820bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4821 // 2 32-bit inline constants packed into one.
4822 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4823 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4824}
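// For illustration: 0x0000004000000001 packs the inline integers 64 and 1 and
// is legal, 0x3F8000003F800000 packs 1.0f in both halves and is also legal,
// while 0x0000004100000001 is rejected because 65 is not an inline constant.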
4825
4826bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4827 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4828 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4829 return false;
4830
4831 int Op32 = AMDGPU::getVOPe32(Opcode);
4832 if (Op32 == -1)
4833 return false;
4834
4835 return pseudoToMCOpcode(Op32) != -1;
4836}
4837
4838bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4839 // The src0_modifier operand is present on all instructions
4840 // that have modifiers.
4841
4842 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4843}
4844
4845bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4846 AMDGPU::OpName OpName) const {
4847 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4848 return Mods && Mods->getImm();
4849}
4850
4851bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4852 return any_of(ModifierOpNames,
4853 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4854}
4855
4856bool SIInstrInfo::canShrink(const MachineInstr &MI,
4857 const MachineRegisterInfo &MRI) const {
4858 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4859 // Can't shrink instruction with three operands.
4860 if (Src2) {
4861 switch (MI.getOpcode()) {
4862 default: return false;
4863
4864 case AMDGPU::V_ADDC_U32_e64:
4865 case AMDGPU::V_SUBB_U32_e64:
4866 case AMDGPU::V_SUBBREV_U32_e64: {
4867 const MachineOperand *Src1
4868 = getNamedOperand(MI, AMDGPU::OpName::src1);
4869 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4870 return false;
4871 // Additional verification is needed for sdst/src2.
4872 return true;
4873 }
4874 case AMDGPU::V_MAC_F16_e64:
4875 case AMDGPU::V_MAC_F32_e64:
4876 case AMDGPU::V_MAC_LEGACY_F32_e64:
4877 case AMDGPU::V_FMAC_F16_e64:
4878 case AMDGPU::V_FMAC_F16_t16_e64:
4879 case AMDGPU::V_FMAC_F16_fake16_e64:
4880 case AMDGPU::V_FMAC_F32_e64:
4881 case AMDGPU::V_FMAC_F64_e64:
4882 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4883 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4884 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4885 return false;
4886 break;
4887
4888 case AMDGPU::V_CNDMASK_B32_e64:
4889 break;
4890 }
4891 }
4892
4893 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4894 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4895 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4896 return false;
4897
4898 // We don't need to check src0, all input types are legal, so just make sure
4899 // src0 isn't using any modifiers.
4900 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4901 return false;
4902
4903 // Can it be shrunk to a valid 32 bit opcode?
4904 if (!hasVALU32BitEncoding(MI.getOpcode()))
4905 return false;
4906
4907 // Check output modifiers
4908 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4909 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4910 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4911 // TODO: Can we avoid checking bound_ctrl/fi here?
4912 // They are only used by permlane*_swap special case.
4913 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4914 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4915}
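// For illustration: a v_add_f32_e64 with a VGPR src1 and no source or output
// modifiers can be shrunk to v_add_f32_e32, whereas the same instruction with
// an SGPR in src1, or with clamp/omod set, makes canShrink return false.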
4916
4917// Set VCC operand with all flags from \p Orig, except for setting it as
4918// implicit.
4919static void copyFlagsToImplicitVCC(MachineInstr &MI,
4920 const MachineOperand &Orig) {
4921
4922 for (MachineOperand &Use : MI.implicit_operands()) {
4923 if (Use.isUse() &&
4924 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4925 Use.setIsUndef(Orig.isUndef());
4926 Use.setIsKill(Orig.isKill());
4927 return;
4928 }
4929 }
4930}
4931
4932MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4933 unsigned Op32) const {
4934 MachineBasicBlock *MBB = MI.getParent();
4935
4936 const MCInstrDesc &Op32Desc = get(Op32);
4937 MachineInstrBuilder Inst32 =
4938 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4939 .setMIFlags(MI.getFlags());
4940
4941 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4942 // For VOPC instructions, this is replaced by an implicit def of vcc.
4943
4944 // We assume the defs of the shrunk opcode are in the same order, and the
4945 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4946 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4947 Inst32.add(MI.getOperand(I));
4948
4949 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4950
4951 int Idx = MI.getNumExplicitDefs();
4952 for (const MachineOperand &Use : MI.explicit_uses()) {
4953 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4954 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4955 continue;
4956
4957 if (&Use == Src2) {
4958 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4959 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4960 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4961 // of vcc was already added during the initial BuildMI, but we
4962 // 1) may need to change vcc to vcc_lo to preserve the original register
4963 // 2) have to preserve the original flags.
4964 copyFlagsToImplicitVCC(*Inst32, *Src2);
4965 continue;
4966 }
4967 }
4968
4969 Inst32.add(Use);
4970 }
4971
4972 // FIXME: Losing implicit operands
4973 fixImplicitOperands(*Inst32);
4974 return Inst32;
4975}
4976
4977bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4978 // Null is free
4979 Register Reg = RegOp.getReg();
4980 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4981 return false;
4982
4983 // SGPRs use the constant bus
4984
4985 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4986 // physical register operands should also count, except for exec.
4987 if (RegOp.isImplicit())
4988 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4989
4990 // SGPRs use the constant bus
4991 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4992 AMDGPU::SReg_64RegClass.contains(Reg);
4993}
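// For illustration: an explicit SReg_32/SReg_64 operand or an implicit read of
// VCC, VCC_LO, or M0 counts against the constant bus here, while SGPR_NULL,
// SGPR_NULL64, and implicit reads of EXEC do not.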
4994
4996 const MachineRegisterInfo &MRI) const {
4997 Register Reg = RegOp.getReg();
4998 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4999 : physRegUsesConstantBus(RegOp);
5000}
5001
5002bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
5003 const MachineOperand &MO,
5004 const MCOperandInfo &OpInfo) const {
5005 // Literal constants use the constant bus.
5006 if (!MO.isReg())
5007 return !isInlineConstant(MO, OpInfo);
5008
5009 Register Reg = MO.getReg();
5010 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5011 : physRegUsesConstantBus(MO);
5012}
5013
5014Register SIInstrInfo::findImplicitSGPRRead(const MachineInstr &MI) {
5015 for (const MachineOperand &MO : MI.implicit_operands()) {
5016 // We only care about reads.
5017 if (MO.isDef())
5018 continue;
5019
5020 switch (MO.getReg()) {
5021 case AMDGPU::VCC:
5022 case AMDGPU::VCC_LO:
5023 case AMDGPU::VCC_HI:
5024 case AMDGPU::M0:
5025 case AMDGPU::FLAT_SCR:
5026 return MO.getReg();
5027
5028 default:
5029 break;
5030 }
5031 }
5032
5033 return Register();
5034}
5035
5036static bool shouldReadExec(const MachineInstr &MI) {
5037 if (SIInstrInfo::isVALU(MI)) {
5038 switch (MI.getOpcode()) {
5039 case AMDGPU::V_READLANE_B32:
5040 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5041 case AMDGPU::V_WRITELANE_B32:
5042 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5043 return false;
5044 }
5045
5046 return true;
5047 }
5048
5049 if (MI.isPreISelOpcode() ||
5050 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
5051 SIInstrInfo::isSALU(MI) ||
5052 SIInstrInfo::isSMRD(MI))
5053 return false;
5054
5055 return true;
5056}
5057
5058static bool isRegOrFI(const MachineOperand &MO) {
5059 return MO.isReg() || MO.isFI();
5060}
5061
5062static bool isSubRegOf(const SIRegisterInfo &TRI,
5063 const MachineOperand &SuperVec,
5064 const MachineOperand &SubReg) {
5065 if (SubReg.getReg().isPhysical())
5066 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5067
5068 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5069 SubReg.getReg() == SuperVec.getReg();
5070}
5071
5072// Check for an illegal copy from a vector register to an SGPR for the generic COPY opcode.
5073bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5074 const MachineRegisterInfo &MRI,
5075 StringRef &ErrInfo) const {
5076 Register DstReg = MI.getOperand(0).getReg();
5077 Register SrcReg = MI.getOperand(1).getReg();
5078 // This is a check for copy from vector register to SGPR
5079 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5080 ErrInfo = "illegal copy from vector register to SGPR";
5081 return false;
5082 }
5083 return true;
5084}
5085
5086bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
5087 StringRef &ErrInfo) const {
5088 uint16_t Opcode = MI.getOpcode();
5089 const MachineFunction *MF = MI.getMF();
5090 const MachineRegisterInfo &MRI = MF->getRegInfo();
5091
5092 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5093 // Find a better property to recognize the point where instruction selection
5094 // is just done.
5095 // We can only enforce this check after SIFixSGPRCopies pass so that the
5096 // illegal copies are legalized and thereafter we don't expect a pass
5097 // inserting similar copies.
5098 if (!MRI.isSSA() && MI.isCopy())
5099 return verifyCopy(MI, MRI, ErrInfo);
5100
5101 if (SIInstrInfo::isGenericOpcode(Opcode))
5102 return true;
5103
5104 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5105 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5106 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5107 int Src3Idx = -1;
5108 if (Src0Idx == -1) {
5109 // VOPD V_DUAL_* instructions use different operand names.
5110 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5111 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5112 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5113 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5114 }
5115
5116 // Make sure the number of operands is correct.
5117 const MCInstrDesc &Desc = get(Opcode);
5118 if (!Desc.isVariadic() &&
5119 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5120 ErrInfo = "Instruction has wrong number of operands.";
5121 return false;
5122 }
5123
5124 if (MI.isInlineAsm()) {
5125 // Verify register classes for inlineasm constraints.
5126 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5127 I != E; ++I) {
5128 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5129 if (!RC)
5130 continue;
5131
5132 const MachineOperand &Op = MI.getOperand(I);
5133 if (!Op.isReg())
5134 continue;
5135
5136 Register Reg = Op.getReg();
5137 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5138 ErrInfo = "inlineasm operand has incorrect register class.";
5139 return false;
5140 }
5141 }
5142
5143 return true;
5144 }
5145
5146 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5147 ErrInfo = "missing memory operand from image instruction.";
5148 return false;
5149 }
5150
5151 // Make sure the register classes are correct.
5152 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5153 const MachineOperand &MO = MI.getOperand(i);
5154 if (MO.isFPImm()) {
5155 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5156 "all fp values to integers.";
5157 return false;
5158 }
5159
5160 const MCOperandInfo &OpInfo = Desc.operands()[i];
5161 int16_t RegClass = getOpRegClassID(OpInfo);
5162
5163 switch (OpInfo.OperandType) {
5164 case MCOI::OPERAND_REGISTER:
5165 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5166 ErrInfo = "Illegal immediate value for operand.";
5167 return false;
5168 }
5169 break;
5182 break;
5184 break;
5185 break;
5199 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5200 ErrInfo = "Illegal immediate value for operand.";
5201 return false;
5202 }
5203 break;
5204 }
5206 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5207 ErrInfo = "Expected inline constant for operand.";
5208 return false;
5209 }
5210 break;
5214 break;
5219 // Check if this operand is an immediate.
5220 // FrameIndex operands will be replaced by immediates, so they are
5221 // allowed.
5222 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5223 ErrInfo = "Expected immediate, but got non-immediate";
5224 return false;
5225 }
5226 break;
5230 break;
5231 default:
5232 if (OpInfo.isGenericType())
5233 continue;
5234 break;
5235 }
5236
5237 if (!MO.isReg())
5238 continue;
5239 Register Reg = MO.getReg();
5240 if (!Reg)
5241 continue;
5242
5243 // FIXME: Ideally we would have separate instruction definitions with the
5244 // aligned register constraint.
5245 // FIXME: We do not verify inline asm operands, but custom inline asm
5246 // verification is broken anyway
5247 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
5248 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5249 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5250 if (const TargetRegisterClass *SubRC =
5251 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5252 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5253 if (RC)
5254 RC = SubRC;
5255 }
5256 }
5257
5258 // Check that this is the aligned version of the class.
5259 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5260 ErrInfo = "Subtarget requires even aligned vector registers";
5261 return false;
5262 }
5263 }
5264
5265 if (RegClass != -1) {
5266 if (Reg.isVirtual())
5267 continue;
5268
5269 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5270 if (!RC->contains(Reg)) {
5271 ErrInfo = "Operand has incorrect register class.";
5272 return false;
5273 }
5274 }
5275 }
5276
5277 // Verify SDWA
5278 if (isSDWA(MI)) {
5279 if (!ST.hasSDWA()) {
5280 ErrInfo = "SDWA is not supported on this target";
5281 return false;
5282 }
5283
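    // SDWA sel operands choose which byte or word of the 32-bit value is
    // read or written; encodings greater than SdwaSel::DWORD are invalid.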
5284 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5285 AMDGPU::OpName::dst_sel}) {
5286 const MachineOperand *MO = getNamedOperand(MI, Op);
5287 if (!MO)
5288 continue;
5289 int64_t Imm = MO->getImm();
5290 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5291 ErrInfo = "Invalid SDWA selection";
5292 return false;
5293 }
5294 }
5295
5296 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5297
5298 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5299 if (OpIdx == -1)
5300 continue;
5301 const MachineOperand &MO = MI.getOperand(OpIdx);
5302
5303 if (!ST.hasSDWAScalar()) {
5304         // Only VGPRs on VI
5305 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5306 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5307 return false;
5308 }
5309 } else {
5310 // No immediates on GFX9
5311 if (!MO.isReg()) {
5312 ErrInfo =
5313 "Only reg allowed as operands in SDWA instructions on GFX9+";
5314 return false;
5315 }
5316 }
5317 }
5318
5319 if (!ST.hasSDWAOmod()) {
5320 // No omod allowed on VI
5321 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5322 if (OMod != nullptr &&
5323 (!OMod->isImm() || OMod->getImm() != 0)) {
5324 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5325 return false;
5326 }
5327 }
5328
5329 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5330 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5331 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5332 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5333 const MachineOperand *Src0ModsMO =
5334 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5335 unsigned Mods = Src0ModsMO->getImm();
5336 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5337 Mods & SISrcMods::SEXT) {
5338 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5339 return false;
5340 }
5341 }
5342
5343 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5344 if (isVOPC(BasicOpcode)) {
5345 if (!ST.hasSDWASdst() && DstIdx != -1) {
5346 // Only vcc allowed as dst on VI for VOPC
5347 const MachineOperand &Dst = MI.getOperand(DstIdx);
5348 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5349 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5350 return false;
5351 }
5352 } else if (!ST.hasSDWAOutModsVOPC()) {
5353 // No clamp allowed on GFX9 for VOPC
5354 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5355 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5356 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5357 return false;
5358 }
5359
5360 // No omod allowed on GFX9 for VOPC
5361 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5362 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5363 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5364 return false;
5365 }
5366 }
5367 }
5368
5369 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5370 if (DstUnused && DstUnused->isImm() &&
5371 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5372 const MachineOperand &Dst = MI.getOperand(DstIdx);
5373 if (!Dst.isReg() || !Dst.isTied()) {
5374 ErrInfo = "Dst register should have tied register";
5375 return false;
5376 }
5377
5378 const MachineOperand &TiedMO =
5379 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5380 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5381 ErrInfo =
5382 "Dst register should be tied to implicit use of preserved register";
5383 return false;
5384 }
5385 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5386 ErrInfo = "Dst register should use same physical register as preserved";
5387 return false;
5388 }
5389 }
5390 }
5391
5392 // Verify MIMG / VIMAGE / VSAMPLE
5393 if (isImage(Opcode) && !MI.mayStore()) {
5394     // Ensure that the return type used is large enough for all the options
5395     // being used. TFE/LWE require an extra result register.
5396 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5397 if (DMask) {
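      // Each set dmask bit enables one result channel; gather4 always
      // returns four channels regardless of dmask.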
5398 uint64_t DMaskImm = DMask->getImm();
5399 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5400 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5401 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5402 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5403
5404 // Adjust for packed 16 bit values
5405 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5406 RegCount = divideCeil(RegCount, 2);
5407
5408 // Adjust if using LWE or TFE
5409 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5410 RegCount += 1;
5411
5412 const uint32_t DstIdx =
5413 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5414 const MachineOperand &Dst = MI.getOperand(DstIdx);
5415 if (Dst.isReg()) {
5416 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5417 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5418 if (RegCount > DstSize) {
5419 ErrInfo = "Image instruction returns too many registers for dst "
5420 "register class";
5421 return false;
5422 }
5423 }
5424 }
5425 }
5426
5427 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5428 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5429 unsigned ConstantBusCount = 0;
5430 bool UsesLiteral = false;
5431 const MachineOperand *LiteralVal = nullptr;
5432
5433 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5434 if (ImmIdx != -1) {
5435 ++ConstantBusCount;
5436 UsesLiteral = true;
5437 LiteralVal = &MI.getOperand(ImmIdx);
5438 }
5439
5440 SmallVector<Register, 2> SGPRsUsed;
5441 Register SGPRUsed;
5442
5443 // Only look at the true operands. Only a real operand can use the constant
5444 // bus, and we don't want to check pseudo-operands like the source modifier
5445 // flags.
5446 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5447 if (OpIdx == -1)
5448 continue;
5449 const MachineOperand &MO = MI.getOperand(OpIdx);
5450 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5451 if (MO.isReg()) {
5452 SGPRUsed = MO.getReg();
5453 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5454 ++ConstantBusCount;
5455 SGPRsUsed.push_back(SGPRUsed);
5456 }
5457 } else if (!MO.isFI()) { // Treat FI like a register.
5458 if (!UsesLiteral) {
5459 ++ConstantBusCount;
5460 UsesLiteral = true;
5461 LiteralVal = &MO;
5462 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5463 assert(isVOP2(MI) || isVOP3(MI));
5464 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5465 return false;
5466 }
5467 }
5468 }
5469 }
5470
5471 SGPRUsed = findImplicitSGPRRead(MI);
5472 if (SGPRUsed) {
5473 // Implicit uses may safely overlap true operands
5474 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5475 return !RI.regsOverlap(SGPRUsed, SGPR);
5476 })) {
5477 ++ConstantBusCount;
5478 SGPRsUsed.push_back(SGPRUsed);
5479 }
5480 }
5481
5482 // v_writelane_b32 is an exception from constant bus restriction:
5483 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5484 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5485 Opcode != AMDGPU::V_WRITELANE_B32) {
5486 ErrInfo = "VOP* instruction violates constant bus restriction";
5487 return false;
5488 }
5489
5490 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5491 ErrInfo = "VOP3 instruction uses literal";
5492 return false;
5493 }
5494 }
5495
5496 // Special case for writelane - this can break the multiple constant bus rule,
5497 // but still can't use more than one SGPR register
5498 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5499 unsigned SGPRCount = 0;
5500 Register SGPRUsed;
5501
5502 for (int OpIdx : {Src0Idx, Src1Idx}) {
5503 if (OpIdx == -1)
5504 break;
5505
5506 const MachineOperand &MO = MI.getOperand(OpIdx);
5507
5508 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5509 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5510 if (MO.getReg() != SGPRUsed)
5511 ++SGPRCount;
5512 SGPRUsed = MO.getReg();
5513 }
5514 }
5515 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5516 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5517 return false;
5518 }
5519 }
5520 }
5521
5522 // Verify misc. restrictions on specific instructions.
5523 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5524 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5525 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5526 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5527 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5528 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5529 if (!compareMachineOp(Src0, Src1) &&
5530 !compareMachineOp(Src0, Src2)) {
5531 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5532 return false;
5533 }
5534 }
5535 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5536 SISrcMods::ABS) ||
5537 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5538 SISrcMods::ABS) ||
5539 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5540 SISrcMods::ABS)) {
5541 ErrInfo = "ABS not allowed in VOP3B instructions";
5542 return false;
5543 }
5544 }
5545
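  // SOP2/SOPC can encode at most one literal; the other source must be a
  // register, a frame index, an inline constant, or the same literal.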
5546 if (isSOP2(MI) || isSOPC(MI)) {
5547 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5548 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5549
5550 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5551 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5552 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5553 !Src0.isIdenticalTo(Src1)) {
5554 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5555 return false;
5556 }
5557 }
5558
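  // SOPK immediates are 16 bits wide; zero-extending opcodes take an unsigned
  // value, all others a signed one.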
5559 if (isSOPK(MI)) {
5560 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5561 if (Desc.isBranch()) {
5562 if (!Op->isMBB()) {
5563 ErrInfo = "invalid branch target for SOPK instruction";
5564 return false;
5565 }
5566 } else {
5567 uint64_t Imm = Op->getImm();
5568 if (sopkIsZext(Opcode)) {
5569 if (!isUInt<16>(Imm)) {
5570 ErrInfo = "invalid immediate for SOPK instruction";
5571 return false;
5572 }
5573 } else {
5574 if (!isInt<16>(Imm)) {
5575 ErrInfo = "invalid immediate for SOPK instruction";
5576 return false;
5577 }
5578 }
5579 }
5580 }
5581
5582 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5583 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5584 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5585 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5586 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5587 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5588
5589 const unsigned StaticNumOps =
5590 Desc.getNumOperands() + Desc.implicit_uses().size();
5591 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5592
5593 // Require additional implicit operands. This allows a fixup done by the
5594 // post RA scheduler where the main implicit operand is killed and
5595 // implicit-defs are added for sub-registers that remain live after this
5596 // instruction.
5597 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5598 ErrInfo = "missing implicit register operands";
5599 return false;
5600 }
5601
5602 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5603 if (IsDst) {
5604 if (!Dst->isUse()) {
5605 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5606 return false;
5607 }
5608
5609 unsigned UseOpIdx;
5610 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5611 UseOpIdx != StaticNumOps + 1) {
5612 ErrInfo = "movrel implicit operands should be tied";
5613 return false;
5614 }
5615 }
5616
5617 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5618 const MachineOperand &ImpUse
5619 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5620 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5621 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5622 ErrInfo = "src0 should be subreg of implicit vector use";
5623 return false;
5624 }
5625 }
5626
5627 // Make sure we aren't losing exec uses in the td files. This mostly requires
5628 // being careful when using let Uses to try to add other use registers.
5629 if (shouldReadExec(MI)) {
5630 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5631 ErrInfo = "VALU instruction does not implicitly read exec mask";
5632 return false;
5633 }
5634 }
5635
5636 if (isSMRD(MI)) {
5637 if (MI.mayStore() &&
5638 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5639 // The register offset form of scalar stores may only use m0 as the
5640 // soffset register.
5641 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5642 if (Soff && Soff->getReg() != AMDGPU::M0) {
5643 ErrInfo = "scalar stores must use m0 as offset register";
5644 return false;
5645 }
5646 }
5647 }
5648
5649 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5650 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5651 if (Offset->getImm() != 0) {
5652 ErrInfo = "subtarget does not support offsets in flat instructions";
5653 return false;
5654 }
5655 }
5656
5657 if (isDS(MI) && !ST.hasGDS()) {
5658 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5659 if (GDSOp && GDSOp->getImm() != 0) {
5660 ErrInfo = "GDS is not supported on this subtarget";
5661 return false;
5662 }
5663 }
5664
5665 if (isImage(MI)) {
5666 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5667 if (DimOp) {
5668 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5669 AMDGPU::OpName::vaddr0);
5670 AMDGPU::OpName RSrcOpName =
5671 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5672 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5673 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5674 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5675 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5676 const AMDGPU::MIMGDimInfo *Dim =
5677           AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5678
5679 if (!Dim) {
5680 ErrInfo = "dim is out of range";
5681 return false;
5682 }
5683
5684 bool IsA16 = false;
5685 if (ST.hasR128A16()) {
5686 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5687 IsA16 = R128A16->getImm() != 0;
5688 } else if (ST.hasA16()) {
5689 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5690 IsA16 = A16->getImm() != 0;
5691 }
5692
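      // With the NSA (non-sequential address) encoding every address
      // component occupies its own operand, so the operand distance from
      // vaddr0 to the resource gives the number of address registers.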
5693 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5694
5695 unsigned AddrWords =
5696 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5697
5698 unsigned VAddrWords;
5699 if (IsNSA) {
5700 VAddrWords = RsrcIdx - VAddr0Idx;
5701 if (ST.hasPartialNSAEncoding() &&
5702 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5703 unsigned LastVAddrIdx = RsrcIdx - 1;
5704 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5705 }
5706 } else {
5707 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5708 if (AddrWords > 12)
5709 AddrWords = 16;
5710 }
5711
5712 if (VAddrWords != AddrWords) {
5713 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5714 << " but got " << VAddrWords << "\n");
5715 ErrInfo = "bad vaddr size";
5716 return false;
5717 }
5718 }
5719 }
5720
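  // Verify DPP (data parallel primitives) controls: reject unused encodings
  // and controls that are not supported on the current generation.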
5721 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5722 if (DppCt) {
5723 using namespace AMDGPU::DPP;
5724
5725 unsigned DC = DppCt->getImm();
5726 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5727 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5728 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5729 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5730 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5731 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5732 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5733 ErrInfo = "Invalid dpp_ctrl value";
5734 return false;
5735 }
5736 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5737 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5738 ErrInfo = "Invalid dpp_ctrl value: "
5739 "wavefront shifts are not supported on GFX10+";
5740 return false;
5741 }
5742 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5743 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5744 ErrInfo = "Invalid dpp_ctrl value: "
5745 "broadcasts are not supported on GFX10+";
5746 return false;
5747 }
5748 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5749 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5750 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5751 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5752 !ST.hasGFX90AInsts()) {
5753 ErrInfo = "Invalid dpp_ctrl value: "
5754 "row_newbroadcast/row_share is not supported before "
5755 "GFX90A/GFX10";
5756 return false;
5757 }
5758 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5759 ErrInfo = "Invalid dpp_ctrl value: "
5760 "row_share and row_xmask are not supported before GFX10";
5761 return false;
5762 }
5763 }
5764
5765 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5766       !AMDGPU::isLegalDPALU_DPPControl(ST, DC) &&
5767       AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5768 ErrInfo = "Invalid dpp_ctrl value: "
5769 "DP ALU dpp only support row_newbcast";
5770 return false;
5771 }
5772 }
5773
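  // For loads and stores, data and destination operands must agree on AGPR
  // vs. VGPR usage; AGPR data operands are only allowed on gfx90a and later.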
5774 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5775 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5776 AMDGPU::OpName DataName =
5777 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5778 const MachineOperand *Data = getNamedOperand(MI, DataName);
5779 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5780 if (Data && !Data->isReg())
5781 Data = nullptr;
5782
5783 if (ST.hasGFX90AInsts()) {
5784 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5785 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5786 ErrInfo = "Invalid register class: "
5787 "vdata and vdst should be both VGPR or AGPR";
5788 return false;
5789 }
5790 if (Data && Data2 &&
5791 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5792 ErrInfo = "Invalid register class: "
5793 "both data operands should be VGPR or AGPR";
5794 return false;
5795 }
5796 } else {
5797 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5798 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5799 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5800 ErrInfo = "Invalid register class: "
5801 "agpr loads and stores not supported on this GPU";
5802 return false;
5803 }
5804 }
5805 }
5806
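  // On subtargets that require even-aligned VGPRs, certain operands
  // (DS_GWS data0 and the image vaddr) must start at an even-numbered
  // register.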
5807 if (ST.needsAlignedVGPRs()) {
5808 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5809       const MachineOperand *Op = getNamedOperand(MI, OpName);
5810       if (!Op)
5811 return true;
5812 Register Reg = Op->getReg();
5813 if (Reg.isPhysical())
5814 return !(RI.getHWRegIndex(Reg) & 1);
5815 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5816 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5817 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5818 };
5819
5820 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5821 Opcode == AMDGPU::DS_GWS_BARRIER) {
5822
5823 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5824 ErrInfo = "Subtarget requires even aligned vector registers "
5825 "for DS_GWS instructions";
5826 return false;
5827 }
5828 }
5829
5830 if (isMIMG(MI)) {
5831 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5832 ErrInfo = "Subtarget requires even aligned vector registers "
5833 "for vaddr operand of image instructions";
5834 return false;
5835 }
5836 }
5837 }
5838
5839 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5840 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5841 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5842 ErrInfo = "Invalid register class: "
5843 "v_accvgpr_write with an SGPR is not supported on this GPU";
5844 return false;
5845 }
5846 }
5847
5848 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5849 const MachineOperand &SrcOp = MI.getOperand(1);
5850 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5851 ErrInfo = "pseudo expects only physical SGPRs";
5852 return false;
5853 }
5854 }
5855
5856 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5857 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5858 if (!ST.hasScaleOffset()) {
5859 ErrInfo = "Subtarget does not support offset scaling";
5860 return false;
5861 }
5862 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5863 ErrInfo = "Instruction does not support offset scaling";
5864 return false;
5865 }
5866 }
5867 }
5868
5869 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5870 // information.
5871 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5872 for (unsigned I = 0; I < 3; ++I) {
5873       if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
5874         return false;
5875 }
5876 }
5877
5878 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5879 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
5880 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
5881 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
5882 &AMDGPU::SReg_64RegClass) ||
5883 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5884 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5885 return false;
5886 }
5887 }
5888
5889 return true;
5890}
5891
5892// It is more readable to list mapped opcodes on the same line.
5893// clang-format off
5894
5895 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5896   switch (MI.getOpcode()) {
5897 default: return AMDGPU::INSTRUCTION_LIST_END;
5898 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5899 case AMDGPU::COPY: return AMDGPU::COPY;
5900 case AMDGPU::PHI: return AMDGPU::PHI;
5901 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5902 case AMDGPU::WQM: return AMDGPU::WQM;
5903 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5904 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5905 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5906 case AMDGPU::S_MOV_B32: {
5907 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5908 return MI.getOperand(1).isReg() ||
5909 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5910 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5911 }
5912 case AMDGPU::S_ADD_I32:
5913 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5914 case AMDGPU::S_ADDC_U32:
5915 return AMDGPU::V_ADDC_U32_e32;
5916 case AMDGPU::S_SUB_I32:
5917 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5918 // FIXME: These are not consistently handled, and selected when the carry is
5919 // used.
5920 case AMDGPU::S_ADD_U32:
5921 return AMDGPU::V_ADD_CO_U32_e32;
5922 case AMDGPU::S_SUB_U32:
5923 return AMDGPU::V_SUB_CO_U32_e32;
5924 case AMDGPU::S_ADD_U64_PSEUDO:
5925 return AMDGPU::V_ADD_U64_PSEUDO;
5926 case AMDGPU::S_SUB_U64_PSEUDO:
5927 return AMDGPU::V_SUB_U64_PSEUDO;
5928 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5929 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5930 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5931 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5932 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5933 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5934 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5935 case AMDGPU::S_XNOR_B32:
5936 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5937 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5938 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5939 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5940 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5941 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5942 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5943 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5944 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5945 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5946 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5947 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5948 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5949 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5950 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5951 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5952 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5953 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5954 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5955 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5956 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5957 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5958 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5959 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5960 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5961 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5962 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5963 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5964 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5965 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5966 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5967 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5968 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5969 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5970 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5971 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5972 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5973 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5974 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5975 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5976 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5977 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5978 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5979 case AMDGPU::S_CVT_F32_F16:
5980 case AMDGPU::S_CVT_HI_F32_F16:
5981 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5982 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5983 case AMDGPU::S_CVT_F16_F32:
5984 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5985 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5986 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5987 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5988 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5989 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5990 case AMDGPU::S_CEIL_F16:
5991 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5992 : AMDGPU::V_CEIL_F16_fake16_e64;
5993 case AMDGPU::S_FLOOR_F16:
5994 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5995 : AMDGPU::V_FLOOR_F16_fake16_e64;
5996 case AMDGPU::S_TRUNC_F16:
5997 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5998 : AMDGPU::V_TRUNC_F16_fake16_e64;
5999 case AMDGPU::S_RNDNE_F16:
6000 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6001 : AMDGPU::V_RNDNE_F16_fake16_e64;
6002 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6003 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6004 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6005 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6006 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6007 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6008 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6009 case AMDGPU::S_ADD_F16:
6010 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6011 : AMDGPU::V_ADD_F16_fake16_e64;
6012 case AMDGPU::S_SUB_F16:
6013 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6014 : AMDGPU::V_SUB_F16_fake16_e64;
6015 case AMDGPU::S_MIN_F16:
6016 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6017 : AMDGPU::V_MIN_F16_fake16_e64;
6018 case AMDGPU::S_MAX_F16:
6019 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6020 : AMDGPU::V_MAX_F16_fake16_e64;
6021 case AMDGPU::S_MINIMUM_F16:
6022 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6023 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6024 case AMDGPU::S_MAXIMUM_F16:
6025 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6026 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6027 case AMDGPU::S_MUL_F16:
6028 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6029 : AMDGPU::V_MUL_F16_fake16_e64;
6030 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6031 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6032 case AMDGPU::S_FMAC_F16:
6033 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6034 : AMDGPU::V_FMAC_F16_fake16_e64;
6035 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6036 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6037 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6038 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6039 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6040 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6041 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6042 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6043 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6044 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6045 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6046 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6047 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6048 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6049 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6050 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6051 case AMDGPU::S_CMP_LT_F16:
6052 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6053 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6054 case AMDGPU::S_CMP_EQ_F16:
6055 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6056 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6057 case AMDGPU::S_CMP_LE_F16:
6058 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6059 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6060 case AMDGPU::S_CMP_GT_F16:
6061 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6062 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6063 case AMDGPU::S_CMP_LG_F16:
6064 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6065 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6066 case AMDGPU::S_CMP_GE_F16:
6067 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6068 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6069 case AMDGPU::S_CMP_O_F16:
6070 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6071 : AMDGPU::V_CMP_O_F16_fake16_e64;
6072 case AMDGPU::S_CMP_U_F16:
6073 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6074 : AMDGPU::V_CMP_U_F16_fake16_e64;
6075 case AMDGPU::S_CMP_NGE_F16:
6076 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6077 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6078 case AMDGPU::S_CMP_NLG_F16:
6079 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6080 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6081 case AMDGPU::S_CMP_NGT_F16:
6082 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6083 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6084 case AMDGPU::S_CMP_NLE_F16:
6085 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6086 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6087 case AMDGPU::S_CMP_NEQ_F16:
6088 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6089 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6090 case AMDGPU::S_CMP_NLT_F16:
6091 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6092 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6093 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6094 case AMDGPU::V_S_EXP_F16_e64:
6095 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6096 : AMDGPU::V_EXP_F16_fake16_e64;
6097 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6098 case AMDGPU::V_S_LOG_F16_e64:
6099 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6100 : AMDGPU::V_LOG_F16_fake16_e64;
6101 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6102 case AMDGPU::V_S_RCP_F16_e64:
6103 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6104 : AMDGPU::V_RCP_F16_fake16_e64;
6105 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6106 case AMDGPU::V_S_RSQ_F16_e64:
6107 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6108 : AMDGPU::V_RSQ_F16_fake16_e64;
6109 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6110 case AMDGPU::V_S_SQRT_F16_e64:
6111 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6112 : AMDGPU::V_SQRT_F16_fake16_e64;
6113 }
6114   llvm_unreachable(
6115       "Unexpected scalar opcode without corresponding vector one!");
6116}
6117
6118// clang-format on
6119
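// Save the current EXEC mask into \p Reg and then enable all lanes; the exact
// sequence depends on whether SCC is live at the insertion point.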
6120 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
6121                                         MachineBasicBlock &MBB,
6122                                         MachineBasicBlock::iterator MBBI,
6123                                         const DebugLoc &DL, Register Reg,
6124 bool IsSCCLive,
6125 SlotIndexes *Indexes) const {
6126 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6127 const SIInstrInfo *TII = ST.getInstrInfo();
6129 if (IsSCCLive) {
6130 // Insert two move instructions, one to save the original value of EXEC and
6131 // the other to turn on all bits in EXEC. This is required as we can't use
6132 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
6133 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6134                            .addReg(LMC.ExecReg);
6135     auto FlipExecMI =
6136 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6137 if (Indexes) {
6138 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6139 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6140 }
6141 } else {
6142 auto SaveExec =
6143 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6144 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6145 if (Indexes)
6146 Indexes->insertMachineInstrInMaps(*SaveExec);
6147 }
6148}
6149
6150 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6151                               MachineBasicBlock::iterator MBBI,
6152                               const DebugLoc &DL, Register Reg,
6153 SlotIndexes *Indexes) const {
6155 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6156 .addReg(Reg, RegState::Kill);
6157 if (Indexes)
6158 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6159}
6160
6164 "Not a whole wave func");
6165 MachineBasicBlock &MBB = *MF.begin();
6166 for (MachineInstr &MI : MBB)
6167 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6168 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6169 return &MI;
6170
6171 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6172}
6173
6174 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6175                                                       unsigned OpNo) const {
6176 const MCInstrDesc &Desc = get(MI.getOpcode());
6177 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6178 Desc.operands()[OpNo].RegClass == -1) {
6179 Register Reg = MI.getOperand(OpNo).getReg();
6180
6181 if (Reg.isVirtual()) {
6182 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6183 return MRI.getRegClass(Reg);
6184 }
6185 return RI.getPhysRegBaseClass(Reg);
6186 }
6187
6188 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6189 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6190}
6191
6192 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6193   MachineBasicBlock::iterator I = MI;
6194   MachineBasicBlock *MBB = MI.getParent();
6195 MachineOperand &MO = MI.getOperand(OpIdx);
6196 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6197 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6198 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6199 unsigned Size = RI.getRegSizeInBits(*RC);
6200 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6201 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6202 : AMDGPU::V_MOV_B32_e32;
6203 if (MO.isReg())
6204 Opcode = AMDGPU::COPY;
6205 else if (RI.isSGPRClass(RC))
6206 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6207
6208 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6209 Register Reg = MRI.createVirtualRegister(VRC);
6210 DebugLoc DL = MBB->findDebugLoc(I);
6211 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6212 MO.ChangeToRegister(Reg, false);
6213}
6214
6215 Register SIInstrInfo::buildExtractSubReg(
6216     MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6217     const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6218 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6219 if (!SuperReg.getReg().isVirtual())
6220 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6221
6222 MachineBasicBlock *MBB = MI->getParent();
6223 const DebugLoc &DL = MI->getDebugLoc();
6224 Register SubReg = MRI.createVirtualRegister(SubRC);
6225
6226 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6227 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6228 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6229 return SubReg;
6230}
6231
6232 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6233     MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6234     const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6235 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6236 if (Op.isImm()) {
6237 if (SubIdx == AMDGPU::sub0)
6238 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6239 if (SubIdx == AMDGPU::sub1)
6240 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6241
6242 llvm_unreachable("Unhandled register index for immediate");
6243 }
6244
6245 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6246 SubIdx, SubRC);
6247 return MachineOperand::CreateReg(SubReg, false);
6248}
6249
6250// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6251void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6252 assert(Inst.getNumExplicitOperands() == 3);
6253 MachineOperand Op1 = Inst.getOperand(1);
6254 Inst.removeOperand(1);
6255 Inst.addOperand(Op1);
6256}
6257
6258 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6259                                     const MCOperandInfo &OpInfo,
6260 const MachineOperand &MO) const {
6261 if (!MO.isReg())
6262 return false;
6263
6264 Register Reg = MO.getReg();
6265
6266 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6267 if (Reg.isPhysical())
6268 return DRC->contains(Reg);
6269
6270 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6271
6272 if (MO.getSubReg()) {
6273 const MachineFunction *MF = MO.getParent()->getMF();
6274 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6275 if (!SuperRC)
6276 return false;
6277 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6278 }
6279
6280 return RI.getCommonSubClass(DRC, RC) != nullptr;
6281}
6282
6283 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6284                                     const MachineOperand &MO) const {
6285 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6286 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6287 unsigned Opc = MI.getOpcode();
6288
6289 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6290 // information.
6291 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6292 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6293 constexpr AMDGPU::OpName OpNames[] = {
6294 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6295
6296 for (auto [I, OpName] : enumerate(OpNames)) {
6297 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6298 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6300 return false;
6301 }
6302 }
6303
6304 if (!isLegalRegOperand(MRI, OpInfo, MO))
6305 return false;
6306
6307 // check Accumulate GPR operand
6308 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6309 if (IsAGPR && !ST.hasMAIInsts())
6310 return false;
6311 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6312 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6313 return false;
6314 // Atomics should have both vdst and vdata either vgpr or agpr.
6315 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6316 const int DataIdx = AMDGPU::getNamedOperandIdx(
6317 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6318 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6319 MI.getOperand(DataIdx).isReg() &&
6320 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6321 return false;
6322 if ((int)OpIdx == DataIdx) {
6323 if (VDstIdx != -1 &&
6324 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6325 return false;
6326 // DS instructions with 2 src operands also must have tied RC.
6327 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6328 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6329 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6330 return false;
6331 }
6332
6333 // Check V_ACCVGPR_WRITE_B32_e64
6334 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6335 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6336 RI.isSGPRReg(MRI, MO.getReg()))
6337 return false;
6338
6339 if (ST.hasFlatScratchHiInB64InstHazard() &&
6340 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6341 if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
6342 if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
6343 64)
6344 return false;
6345 }
6346 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6347 return false;
6348 }
6349
6350 return true;
6351}
6352
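// A VSrc-style operand is legal if it is either a register that fits the
// operand's register class or any immediate-like operand.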
6353 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6354                                      const MCOperandInfo &OpInfo,
6355 const MachineOperand &MO) const {
6356 if (MO.isReg())
6357 return isLegalRegOperand(MRI, OpInfo, MO);
6358
6359 // Handle non-register types that are treated like immediates.
6360 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6361 return true;
6362}
6363
6364 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6365     const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6366 const MachineOperand *MO) const {
6367 constexpr unsigned NumOps = 3;
6368 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6369 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6370 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6371 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6372
6373 assert(SrcN < NumOps);
6374
6375 if (!MO) {
6376 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6377 if (SrcIdx == -1)
6378 return true;
6379 MO = &MI.getOperand(SrcIdx);
6380 }
6381
6382 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6383 return true;
6384
6385 int ModsIdx =
6386 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6387 if (ModsIdx == -1)
6388 return true;
6389
6390 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6391 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6392 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6393
6394 return !OpSel && !OpSelHi;
6395}
6396
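// Check whether operand \p MO (or the existing operand at \p OpIdx if MO is
// null) would be legal at index \p OpIdx of \p MI, taking register classes,
// constant bus usage and literal limits into account.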
6397 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6398                                  const MachineOperand *MO) const {
6399 const MachineFunction &MF = *MI.getMF();
6400 const MachineRegisterInfo &MRI = MF.getRegInfo();
6401 const MCInstrDesc &InstDesc = MI.getDesc();
6402 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6403 int64_t RegClass = getOpRegClassID(OpInfo);
6404 const TargetRegisterClass *DefinedRC =
6405 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6406 if (!MO)
6407 MO = &MI.getOperand(OpIdx);
6408
6409 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6410
6411 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6412 const MachineOperand *UsedLiteral = nullptr;
6413
6414 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6415 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6416
6417 // TODO: Be more permissive with frame indexes.
6418 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6419 if (!LiteralLimit--)
6420 return false;
6421
6422 UsedLiteral = MO;
6423 }
6424
6425     SmallDenseSet<RegSubRegPair> SGPRsUsed;
6426     if (MO->isReg())
6427 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6428
6429 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6430 if (i == OpIdx)
6431 continue;
6432 const MachineOperand &Op = MI.getOperand(i);
6433 if (Op.isReg()) {
6434 if (Op.isUse()) {
6435 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6436 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6437 if (--ConstantBusLimit <= 0)
6438 return false;
6439 }
6440 }
6441 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6442 !isInlineConstant(Op, InstDesc.operands()[i])) {
6443 // The same literal may be used multiple times.
6444 if (!UsedLiteral)
6445 UsedLiteral = &Op;
6446 else if (UsedLiteral->isIdenticalTo(Op))
6447 continue;
6448
6449 if (!LiteralLimit--)
6450 return false;
6451 if (--ConstantBusLimit <= 0)
6452 return false;
6453 }
6454 }
6455 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6456 // There can be at most one literal operand, but it can be repeated.
6457 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6458 if (i == OpIdx)
6459 continue;
6460 const MachineOperand &Op = MI.getOperand(i);
6461 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6462 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6463 !Op.isIdenticalTo(*MO))
6464 return false;
6465
6466 // Do not fold a non-inlineable and non-register operand into an
6467 // instruction that already has a frame index. The frame index handling
6468       // code cannot handle a frame index that co-exists with another
6469       // non-register operand, unless that operand is an inlineable immediate.
6470 if (Op.isFI())
6471 return false;
6472 }
6473 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6474 isF16PseudoScalarTrans(MI.getOpcode())) {
6475 return false;
6476 }
6477
6478 if (MO->isReg()) {
6479 if (!DefinedRC)
6480 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6481 return isLegalRegOperand(MI, OpIdx, *MO);
6482 }
6483
6484 if (MO->isImm()) {
6485 uint64_t Imm = MO->getImm();
6486 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6487 bool Is64BitOp = Is64BitFPOp ||
6488 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6489 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6490 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6491 if (Is64BitOp &&
6492 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6493 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6494 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6495 return false;
6496
6497 // FIXME: We can use sign extended 64-bit literals, but only for signed
6498 // operands. At the moment we do not know if an operand is signed.
6499 // Such operand will be encoded as its low 32 bits and then either
6500 // correctly sign extended or incorrectly zero extended by HW.
6501 // If 64-bit literals are supported and the literal will be encoded
6502 // as full 64 bit we still can use it.
6503 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6504 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6505 return false;
6506 }
6507 }
6508
6509 // Handle non-register types that are treated like immediates.
6510 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6511
6512 if (!DefinedRC) {
6513 // This operand expects an immediate.
6514 return true;
6515 }
6516
6517 return isImmOperandLegal(MI, OpIdx, *MO);
6518}
6519
6521 bool IsGFX950Only = ST.hasGFX950Insts();
6522 bool IsGFX940Only = ST.hasGFX940Insts();
6523
6524 if (!IsGFX950Only && !IsGFX940Only)
6525 return false;
6526
6527 if (!isVALU(MI))
6528 return false;
6529
6530 // V_COS, V_EXP, V_RCP, etc.
6531 if (isTRANS(MI))
6532 return true;
6533
6534 // DOT2, DOT2C, DOT4, etc.
6535 if (isDOT(MI))
6536 return true;
6537
6538 // MFMA, SMFMA
6539 if (isMFMA(MI))
6540 return true;
6541
6542 unsigned Opcode = MI.getOpcode();
6543 switch (Opcode) {
6544 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6545 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6546 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6547 case AMDGPU::V_MQSAD_U32_U8_e64:
6548 case AMDGPU::V_PK_ADD_F16:
6549 case AMDGPU::V_PK_ADD_F32:
6550 case AMDGPU::V_PK_ADD_I16:
6551 case AMDGPU::V_PK_ADD_U16:
6552 case AMDGPU::V_PK_ASHRREV_I16:
6553 case AMDGPU::V_PK_FMA_F16:
6554 case AMDGPU::V_PK_FMA_F32:
6555 case AMDGPU::V_PK_FMAC_F16_e32:
6556 case AMDGPU::V_PK_FMAC_F16_e64:
6557 case AMDGPU::V_PK_LSHLREV_B16:
6558 case AMDGPU::V_PK_LSHRREV_B16:
6559 case AMDGPU::V_PK_MAD_I16:
6560 case AMDGPU::V_PK_MAD_U16:
6561 case AMDGPU::V_PK_MAX_F16:
6562 case AMDGPU::V_PK_MAX_I16:
6563 case AMDGPU::V_PK_MAX_U16:
6564 case AMDGPU::V_PK_MIN_F16:
6565 case AMDGPU::V_PK_MIN_I16:
6566 case AMDGPU::V_PK_MIN_U16:
6567 case AMDGPU::V_PK_MOV_B32:
6568 case AMDGPU::V_PK_MUL_F16:
6569 case AMDGPU::V_PK_MUL_F32:
6570 case AMDGPU::V_PK_MUL_LO_U16:
6571 case AMDGPU::V_PK_SUB_I16:
6572 case AMDGPU::V_PK_SUB_U16:
6573 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6574 return true;
6575 default:
6576 return false;
6577 }
6578}
6579
6580 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6581                                        MachineInstr &MI) const {
6582 unsigned Opc = MI.getOpcode();
6583 const MCInstrDesc &InstrDesc = get(Opc);
6584
6585 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6586 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6587
6588 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6589 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6590
6591 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6592 // we need to only have one constant bus use before GFX10.
6593 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6594 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6595 RI.isSGPRReg(MRI, Src0.getReg()))
6596 legalizeOpWithMove(MI, Src0Idx);
6597
6598 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6599 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6600 // src0/src1 with V_READFIRSTLANE.
6601 if (Opc == AMDGPU::V_WRITELANE_B32) {
6602 const DebugLoc &DL = MI.getDebugLoc();
6603 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6604 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6605 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6606 .add(Src0);
6607 Src0.ChangeToRegister(Reg, false);
6608 }
6609 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6610 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6611 const DebugLoc &DL = MI.getDebugLoc();
6612 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6613 .add(Src1);
6614 Src1.ChangeToRegister(Reg, false);
6615 }
6616 return;
6617 }
6618
6619 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6620 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6621 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6622 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6623 legalizeOpWithMove(MI, Src2Idx);
6624 }
6625
6626 // VOP2 src0 instructions support all operand types, so we don't need to check
6627 // their legality. If src1 is already legal, we don't need to do anything.
6628 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6629 return;
6630
6631 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6632 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6633 // select is uniform.
6634 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6635 RI.isVGPR(MRI, Src1.getReg())) {
6636 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6637 const DebugLoc &DL = MI.getDebugLoc();
6638 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6639 .add(Src1);
6640 Src1.ChangeToRegister(Reg, false);
6641 return;
6642 }
6643
6644 // We do not use commuteInstruction here because it is too aggressive and will
6645 // commute if it is possible. We only want to commute here if it improves
6646 // legality. This can be called a fairly large number of times so don't waste
6647 // compile time pointlessly swapping and checking legality again.
6648 if (HasImplicitSGPR || !MI.isCommutable()) {
6649 legalizeOpWithMove(MI, Src1Idx);
6650 return;
6651 }
6652
6653 // If src0 can be used as src1, commuting will make the operands legal.
6654 // Otherwise we have to give up and insert a move.
6655 //
6656 // TODO: Other immediate-like operand kinds could be commuted if there was a
6657 // MachineOperand::ChangeTo* for them.
6658 if ((!Src1.isImm() && !Src1.isReg()) ||
6659 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6660 legalizeOpWithMove(MI, Src1Idx);
6661 return;
6662 }
6663
6664 int CommutedOpc = commuteOpcode(MI);
6665 if (CommutedOpc == -1) {
6666 legalizeOpWithMove(MI, Src1Idx);
6667 return;
6668 }
6669
6670 MI.setDesc(get(CommutedOpc));
6671
6672 Register Src0Reg = Src0.getReg();
6673 unsigned Src0SubReg = Src0.getSubReg();
6674 bool Src0Kill = Src0.isKill();
6675
6676 if (Src1.isImm())
6677 Src0.ChangeToImmediate(Src1.getImm());
6678 else if (Src1.isReg()) {
6679 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6680 Src0.setSubReg(Src1.getSubReg());
6681 } else
6682 llvm_unreachable("Should only have register or immediate operands");
6683
6684 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6685 Src1.setSubReg(Src0SubReg);
6686   fixImplicitOperands(MI);
6687 }
6688
6689 // Legalize VOP3 operands. All operand types are supported for any operand,
6690 // but only one literal constant is allowed, and only starting from GFX10.
6691 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6692                                        MachineInstr &MI) const {
6693 unsigned Opc = MI.getOpcode();
6694
6695 int VOP3Idx[3] = {
6696 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6697 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6698 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6699 };
6700
6701 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6702 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6703 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6704 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6705 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6706 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6707 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6708 // src1 and src2 must be scalar
6709 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6710 const DebugLoc &DL = MI.getDebugLoc();
6711 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6712 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6713 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6714 .add(Src1);
6715 Src1.ChangeToRegister(Reg, false);
6716 }
6717 if (VOP3Idx[2] != -1) {
6718 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6719 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6720 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6721 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6722 .add(Src2);
6723 Src2.ChangeToRegister(Reg, false);
6724 }
6725 }
6726 }
6727
6728 // Find the one SGPR operand we are allowed to use.
6729 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6730 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6731 SmallDenseSet<unsigned> SGPRsUsed;
6732 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6733 if (SGPRReg) {
6734 SGPRsUsed.insert(SGPRReg);
6735 --ConstantBusLimit;
6736 }
6737
6738 for (int Idx : VOP3Idx) {
6739 if (Idx == -1)
6740 break;
6741 MachineOperand &MO = MI.getOperand(Idx);
6742
6743 if (!MO.isReg()) {
6744 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6745 continue;
6746
6747 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6748 --LiteralLimit;
6749 --ConstantBusLimit;
6750 continue;
6751 }
6752
6753 --LiteralLimit;
6754 --ConstantBusLimit;
6755 legalizeOpWithMove(MI, Idx);
6756 continue;
6757 }
6758
6759 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6760 continue; // VGPRs are legal
6761
6762 // We can use one SGPR in each VOP3 instruction prior to GFX10
6763 // and two starting from GFX10.
6764 if (SGPRsUsed.count(MO.getReg()))
6765 continue;
6766 if (ConstantBusLimit > 0) {
6767 SGPRsUsed.insert(MO.getReg());
6768 --ConstantBusLimit;
6769 continue;
6770 }
6771
6772 // If we make it this far, then the operand is not legal and we must
6773 // legalize it.
6774 legalizeOpWithMove(MI, Idx);
6775 }
6776
6777 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6778 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6779 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6780 legalizeOpWithMove(MI, VOP3Idx[2]);
6781
6782 // Fix the register class of packed FP32 instructions on gfx12+. See
6783 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6784   if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6785     for (unsigned I = 0; I < 3; ++I) {
6786       if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6787         legalizeOpWithMove(MI, VOP3Idx[I]);
6788 }
6789 }
6790}
6791
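// Emit v_readfirstlane for each 32-bit piece of \p SrcReg and reassemble the
// result into an SGPR register of the matching width.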
6792 Register SIInstrInfo::readlaneVGPRToSGPR(
6793     Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6794     const TargetRegisterClass *DstRC /*=nullptr*/) const {
6795 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6796 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6797 if (DstRC)
6798 SRC = RI.getCommonSubClass(SRC, DstRC);
6799
6800 Register DstReg = MRI.createVirtualRegister(SRC);
6801 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6802
6803 if (RI.hasAGPRs(VRC)) {
6804 VRC = RI.getEquivalentVGPRClass(VRC);
6805 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6806 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6807 get(TargetOpcode::COPY), NewSrcReg)
6808 .addReg(SrcReg);
6809 SrcReg = NewSrcReg;
6810 }
6811
6812 if (SubRegs == 1) {
6813 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6814 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6815 .addReg(SrcReg);
6816 return DstReg;
6817 }
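  // For sources wider than 32 bits, read each 32-bit piece back with a
  // separate V_READFIRSTLANE_B32 and reassemble the scalar result with a
  // REG_SEQUENCE.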
6818
6819 SmallVector<Register, 8> SRegs;
6820 for (unsigned i = 0; i < SubRegs; ++i) {
6821 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6822 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6823 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6824 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6825 SRegs.push_back(SGPR);
6826 }
6827
6828 MachineInstrBuilder MIB =
6829 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6830 get(AMDGPU::REG_SEQUENCE), DstReg);
6831 for (unsigned i = 0; i < SubRegs; ++i) {
6832 MIB.addReg(SRegs[i]);
6833 MIB.addImm(RI.getSubRegFromChannel(i));
6834 }
6835 return DstReg;
6836}
6837
6838 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6839 MachineInstr &MI) const {
6840
6841 // If the pointer is stored in VGPRs, then we need to move it to
6842 // SGPRs using v_readfirstlane. This is safe because we only select
6843 // loads with uniform pointers to SMRD instructions, so we know the
6844 // pointer value is uniform.
6845 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6846 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6847 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6848 SBase->setReg(SGPR);
6849 }
6850 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6851 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6852 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6853 SOff->setReg(SGPR);
6854 }
6855}
6856
6857 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6858 unsigned Opc = Inst.getOpcode();
6859 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6860 if (OldSAddrIdx < 0)
6861 return false;
6862
6863 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6864
6865 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6866 if (NewOpc < 0)
6867 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6868 if (NewOpc < 0)
6869 return false;
6870
6871 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6872 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6873 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6874 return false;
6875
6876 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6877 if (NewVAddrIdx < 0)
6878 return false;
6879
6880 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6881
6882 // Check vaddr, it shall be zero or absent.
6883 MachineInstr *VAddrDef = nullptr;
6884 if (OldVAddrIdx >= 0) {
6885 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6886 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6887 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6888 !VAddrDef->getOperand(1).isImm() ||
6889 VAddrDef->getOperand(1).getImm() != 0)
6890 return false;
6891 }
6892
6893 const MCInstrDesc &NewDesc = get(NewOpc);
6894 Inst.setDesc(NewDesc);
6895
6896 // Callers expect iterator to be valid after this call, so modify the
6897 // instruction in place.
6898 if (OldVAddrIdx == NewVAddrIdx) {
6899 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6900 // Clear use list from the old vaddr holding a zero register.
6901 MRI.removeRegOperandFromUseList(&NewVAddr);
6902 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6903 Inst.removeOperand(OldSAddrIdx);
6904 // Update the use list with the pointer we have just moved from vaddr to
6905 // saddr position. Otherwise new vaddr will be missing from the use list.
6906 MRI.removeRegOperandFromUseList(&NewVAddr);
6907 MRI.addRegOperandToUseList(&NewVAddr);
6908 } else {
6909 assert(OldSAddrIdx == NewVAddrIdx);
6910
6911 if (OldVAddrIdx >= 0) {
6912 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6913 AMDGPU::OpName::vdst_in);
6914
6915 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6916 // it asserts. Untie the operands for now and retie them afterwards.
6917 if (NewVDstIn != -1) {
6918 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6919 Inst.untieRegOperand(OldVDstIn);
6920 }
6921
6922 Inst.removeOperand(OldVAddrIdx);
6923
6924 if (NewVDstIn != -1) {
6925 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6926 Inst.tieOperands(NewVDst, NewVDstIn);
6927 }
6928 }
6929 }
6930
6931 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6932 VAddrDef->eraseFromParent();
6933
6934 return true;
6935}
6936
6937// FIXME: Remove this when SelectionDAG is obsoleted.
6938 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6939 MachineInstr &MI) const {
6940 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6941 return;
6942
6943 // Fix up SGPR operands that are in VGPRs. We only select these when the DAG
6944 // divergence analysis thinks they are uniform, so a readfirstlane should be valid.
6945 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6946 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6947 return;
6948
6949 if (moveFlatAddrToVGPR(MI))
6950 return;
6951
6952 const TargetRegisterClass *DeclaredRC =
6953 getRegClass(MI.getDesc(), SAddr->getOperandNo());
6954
6955 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6956 SAddr->setReg(ToSGPR);
6957}
6958
6959 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6960                                          MachineBasicBlock::iterator I,
6961 const TargetRegisterClass *DstRC,
6962                                          MachineOperand &Op,
6963                                          MachineRegisterInfo &MRI,
6964 const DebugLoc &DL) const {
6965 Register OpReg = Op.getReg();
6966 unsigned OpSubReg = Op.getSubReg();
6967
6968 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6969 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6970
6971 // Check if operand is already the correct register class.
6972 if (DstRC == OpRC)
6973 return;
6974
6975 Register DstReg = MRI.createVirtualRegister(DstRC);
6976 auto Copy =
6977 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6978 Op.setReg(DstReg);
6979
6980 MachineInstr *Def = MRI.getVRegDef(OpReg);
6981 if (!Def)
6982 return;
6983
6984 // Try to eliminate the copy if it is copying an immediate value.
6985 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6986 foldImmediate(*Copy, *Def, OpReg, &MRI);
6987
6988 bool ImpDef = Def->isImplicitDef();
6989 while (!ImpDef && Def && Def->isCopy()) {
6990 if (Def->getOperand(1).getReg().isPhysical())
6991 break;
6992 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6993 ImpDef = Def && Def->isImplicitDef();
6994 }
6995 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6996 !ImpDef)
6997 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6998}
6999
7000// Emit the actual waterfall loop, executing the wrapped instruction for each
7001// unique value of \p ScalarOps across all lanes. In the best case we execute 1
7002// iteration, in the worst case we execute 64 (once per lane).
7003static void
7004 emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
7005                               MachineRegisterInfo &MRI,
7006 MachineBasicBlock &LoopBB,
7007 MachineBasicBlock &BodyBB,
7008 const DebugLoc &DL,
7009 ArrayRef<MachineOperand *> ScalarOps) {
7010 MachineFunction &MF = *LoopBB.getParent();
7011 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7012 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7014 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7015
7016 MachineBasicBlock::iterator I = LoopBB.begin();
7017 Register CondReg;
7018
7019 for (MachineOperand *ScalarOp : ScalarOps) {
7020 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
7021 unsigned NumSubRegs = RegSize / 32;
7022 Register VScalarOp = ScalarOp->getReg();
7023
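    // A 32-bit operand needs a single V_READFIRSTLANE_B32 plus a V_CMP_EQ_U32
    // against the original VGPR; wider operands are read back and compared in
    // 64-bit chunks in the else branch below.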
7024 if (NumSubRegs == 1) {
7025 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7026
7027 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
7028 .addReg(VScalarOp);
7029
7030 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7031
7032 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
7033 .addReg(CurReg)
7034 .addReg(VScalarOp);
7035
7036 // Combine the comparison results with AND.
7037 if (!CondReg) // First.
7038 CondReg = NewCondReg;
7039 else { // If not the first, we create an AND.
7040 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7041 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7042 .addReg(CondReg)
7043 .addReg(NewCondReg);
7044 CondReg = AndReg;
7045 }
7046
7047 // Update ScalarOp operand to use the SGPR ScalarOp.
7048 ScalarOp->setReg(CurReg);
7049 ScalarOp->setIsKill();
7050 } else {
7051 SmallVector<Register, 8> ReadlanePieces;
7052 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
7053 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7054 "Unhandled register size");
7055
7056 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7057 Register CurRegLo =
7058 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7059 Register CurRegHi =
7060 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7061
7062 // Read the next variant <- also loop target.
7063 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
7064 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
7065
7066 // Read the next variant <- also loop target.
7067 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
7068 .addReg(VScalarOp, VScalarOpUndef,
7069 TRI->getSubRegFromChannel(Idx + 1));
7070
7071 ReadlanePieces.push_back(CurRegLo);
7072 ReadlanePieces.push_back(CurRegHi);
7073
7074 // Comparison is to be done as 64-bit.
7075 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
7076 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
7077 .addReg(CurRegLo)
7078 .addImm(AMDGPU::sub0)
7079 .addReg(CurRegHi)
7080 .addImm(AMDGPU::sub1);
7081
7082 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7083 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
7084 NewCondReg)
7085 .addReg(CurReg);
7086 if (NumSubRegs <= 2)
7087 Cmp.addReg(VScalarOp);
7088 else
7089 Cmp.addReg(VScalarOp, VScalarOpUndef,
7090 TRI->getSubRegFromChannel(Idx, 2));
7091
7092 // Combine the comparison results with AND.
7093 if (!CondReg) // First.
7094 CondReg = NewCondReg;
7095 else { // If not the first, we create an AND.
7096 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7097 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7098 .addReg(CondReg)
7099 .addReg(NewCondReg);
7100 CondReg = AndReg;
7101 }
7102 } // End for loop.
7103
7104 const auto *SScalarOpRC =
7105 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
7106 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
7107
7108 // Build scalar ScalarOp.
7109 auto Merge =
7110 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7111 unsigned Channel = 0;
7112 for (Register Piece : ReadlanePieces) {
7113 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7114 }
7115
7116 // Update ScalarOp operand to use the SGPR ScalarOp.
7117 ScalarOp->setReg(SScalarOp);
7118 ScalarOp->setIsKill();
7119 }
7120 }
7121
7122 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7123 MRI.setSimpleHint(SaveExec, CondReg);
7124
7125 // Update EXEC to matching lanes, saving original to SaveExec.
7126 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7127 .addReg(CondReg, RegState::Kill);
7128
7129 // The original instruction is here; we insert the terminators after it.
7130 I = BodyBB.end();
7131
7132 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7133 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7134 .addReg(LMC.ExecReg)
7135 .addReg(SaveExec);
7136
7137 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7138}
7139
7140// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7141// with SGPRs by iterating over all unique values across all lanes.
7142// Returns the loop basic block that now contains \p MI.
7143static MachineBasicBlock *
7144 loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
7145                                ArrayRef<MachineOperand *> ScalarOps,
7146                                MachineDominatorTree *MDT,
7147 MachineBasicBlock::iterator Begin = nullptr,
7148 MachineBasicBlock::iterator End = nullptr) {
7149 MachineBasicBlock &MBB = *MI.getParent();
7150 MachineFunction &MF = *MBB.getParent();
7151 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7152 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7153 MachineRegisterInfo &MRI = MF.getRegInfo();
7154 if (!Begin.isValid())
7155 Begin = &MI;
7156 if (!End.isValid()) {
7157 End = &MI;
7158 ++End;
7159 }
7160 const DebugLoc &DL = MI.getDebugLoc();
7162 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7163
7164 // Save SCC. Waterfall Loop may overwrite SCC.
7165 Register SaveSCCReg;
7166
7167 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7168 // rather than doing an unlimited scan everywhere.
7169 bool SCCNotDead =
7170 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7171 std::numeric_limits<unsigned>::max()) !=
7172 MachineBasicBlock::LQR_Dead;
7173 if (SCCNotDead) {
7174 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
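      // S_CSELECT_B32 1, 0 materializes the current SCC value into SaveSCCReg;
      // it is re-created after the loop with the S_CMP_LG_U32 below.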
7175 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7176 .addImm(1)
7177 .addImm(0);
7178 }
7179
7180 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7181
7182 // Save the EXEC mask
7183 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7184
7185 // Killed uses in the instruction we are waterfalling around will be
7186 // incorrect due to the added control-flow.
7187 MachineBasicBlock::iterator AfterMI = MI;
7188 ++AfterMI;
7189 for (auto I = Begin; I != AfterMI; I++) {
7190 for (auto &MO : I->all_uses())
7191 MRI.clearKillFlags(MO.getReg());
7192 }
7193
7194 // To insert the loop we need to split the block. Move everything after this
7195 // point to a new block, and insert a new empty block between the two.
7196 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
7197 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
7198 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7199 MachineFunction::iterator MBBI(MBB);
7200 ++MBBI;
7201
7202 MF.insert(MBBI, LoopBB);
7203 MF.insert(MBBI, BodyBB);
7204 MF.insert(MBBI, RemainderBB);
7205
7206 LoopBB->addSuccessor(BodyBB);
7207 BodyBB->addSuccessor(LoopBB);
7208 BodyBB->addSuccessor(RemainderBB);
7209
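  // The resulting CFG is MBB -> LoopBB -> BodyBB, with BodyBB branching back
  // to LoopBB until every distinct lane value has been processed and then
  // falling through to RemainderBB.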
7210 // Move the instructions from Begin up to MI into BodyBB, and the remainder
7211 // of the block to RemainderBB.
7212 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7213 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7214 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7215
7216 MBB.addSuccessor(LoopBB);
7217
7218 // Update dominators. We know that MBB immediately dominates LoopBB, that
7219 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7220 // RemainderBB. RemainderBB immediately dominates all of the successors
7221 // transferred to it from MBB that MBB used to properly dominate.
7222 if (MDT) {
7223 MDT->addNewBlock(LoopBB, &MBB);
7224 MDT->addNewBlock(BodyBB, LoopBB);
7225 MDT->addNewBlock(RemainderBB, BodyBB);
7226 for (auto &Succ : RemainderBB->successors()) {
7227 if (MDT->properlyDominates(&MBB, Succ)) {
7228 MDT->changeImmediateDominator(Succ, RemainderBB);
7229 }
7230 }
7231 }
7232
7233 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7234
7235 MachineBasicBlock::iterator First = RemainderBB->begin();
7236 // Restore SCC
7237 if (SCCNotDead) {
7238 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7239 .addReg(SaveSCCReg, RegState::Kill)
7240 .addImm(0);
7241 }
7242
7243 // Restore the EXEC mask
7244 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7245 .addReg(SaveExec);
7246 return BodyBB;
7247}
7248
7249// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
7250static std::tuple<unsigned, unsigned>
7251 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
7252 MachineBasicBlock &MBB = *MI.getParent();
7253 MachineFunction &MF = *MBB.getParent();
7254 MachineRegisterInfo &MRI = MF.getRegInfo();
7255
7256 // Extract the ptr from the resource descriptor.
7257 unsigned RsrcPtr =
7258 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7259 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7260
7261 // Create an empty resource descriptor
7262 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7263 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7264 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7265 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7266 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7267
7268 // Zero64 = 0
7269 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7270 .addImm(0);
7271
7272 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7273 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7274 .addImm(Lo_32(RsrcDataFormat));
7275
7276 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7277 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7278 .addImm(Hi_32(RsrcDataFormat));
7279
7280 // NewSRsrc = {Zero64, SRsrcFormat}
7281 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7282 .addReg(Zero64)
7283 .addImm(AMDGPU::sub0_sub1)
7284 .addReg(SRsrcFormatLo)
7285 .addImm(AMDGPU::sub2)
7286 .addReg(SRsrcFormatHi)
7287 .addImm(AMDGPU::sub3);
7288
7289 return std::tuple(RsrcPtr, NewSRsrc);
7290}
7291
7292 MachineBasicBlock *
7293 SIInstrInfo::legalizeOperands(MachineInstr &MI,
7294 MachineDominatorTree *MDT) const {
7295 MachineFunction &MF = *MI.getMF();
7296 MachineRegisterInfo &MRI = MF.getRegInfo();
7297 MachineBasicBlock *CreatedBB = nullptr;
7298
7299 // Legalize VOP2
7300 if (isVOP2(MI) || isVOPC(MI)) {
7301 legalizeOperandsVOP2(MRI, MI);
7302 return CreatedBB;
7303 }
7304
7305 // Legalize VOP3
7306 if (isVOP3(MI)) {
7307 legalizeOperandsVOP3(MRI, MI);
7308 return CreatedBB;
7309 }
7310
7311 // Legalize SMRD
7312 if (isSMRD(MI)) {
7313 legalizeOperandsSMRD(MRI, MI);
7314 return CreatedBB;
7315 }
7316
7317 // Legalize FLAT
7318 if (isFLAT(MI)) {
7319 legalizeOperandsFLAT(MRI, MI);
7320 return CreatedBB;
7321 }
7322
7323 // Legalize REG_SEQUENCE and PHI
7324 // The register class of the operands must be the same type as the register
7325 // class of the output.
7326 if (MI.getOpcode() == AMDGPU::PHI) {
7327 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7328 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7329 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7330 continue;
7331 const TargetRegisterClass *OpRC =
7332 MRI.getRegClass(MI.getOperand(i).getReg());
7333 if (RI.hasVectorRegisters(OpRC)) {
7334 VRC = OpRC;
7335 } else {
7336 SRC = OpRC;
7337 }
7338 }
7339
7340 // If any of the operands are VGPR registers, then they all must be VGPRs;
7341 // otherwise we will create illegal VGPR->SGPR copies when legalizing
7342 // them.
7343 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7344 if (!VRC) {
7345 assert(SRC);
7346 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7347 VRC = &AMDGPU::VReg_1RegClass;
7348 } else
7349 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7350 ? RI.getEquivalentAGPRClass(SRC)
7351 : RI.getEquivalentVGPRClass(SRC);
7352 } else {
7353 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7354 ? RI.getEquivalentAGPRClass(VRC)
7355 : RI.getEquivalentVGPRClass(VRC);
7356 }
7357 RC = VRC;
7358 } else {
7359 RC = SRC;
7360 }
7361
7362 // Update all the operands so they have the same type.
7363 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7364 MachineOperand &Op = MI.getOperand(I);
7365 if (!Op.isReg() || !Op.getReg().isVirtual())
7366 continue;
7367
7368 // MI is a PHI instruction.
7369 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7370 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7371
7372 // Avoid creating no-op copies with the same src and dst reg class. These
7373 // confuse some of the machine passes.
7374 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7375 }
7376 }
7377
7378 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7379 // VGPR dest type and SGPR sources, insert copies so all operands are
7380 // VGPRs. This seems to help operand folding / the register coalescer.
7381 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7382 MachineBasicBlock *MBB = MI.getParent();
7383 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7384 if (RI.hasVGPRs(DstRC)) {
7385 // Update all the operands so they are VGPR register classes. These may
7386 // not be the same register class because REG_SEQUENCE supports mixing
7387 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7388 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7389 MachineOperand &Op = MI.getOperand(I);
7390 if (!Op.isReg() || !Op.getReg().isVirtual())
7391 continue;
7392
7393 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7394 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7395 if (VRC == OpRC)
7396 continue;
7397
7398 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7399 Op.setIsKill();
7400 }
7401 }
7402
7403 return CreatedBB;
7404 }
7405
7406 // Legalize INSERT_SUBREG
7407 // src0 must have the same register class as dst
7408 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7409 Register Dst = MI.getOperand(0).getReg();
7410 Register Src0 = MI.getOperand(1).getReg();
7411 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7412 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7413 if (DstRC != Src0RC) {
7414 MachineBasicBlock *MBB = MI.getParent();
7415 MachineOperand &Op = MI.getOperand(1);
7416 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7417 }
7418 return CreatedBB;
7419 }
7420
7421 // Legalize SI_INIT_M0
7422 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7423 MachineOperand &Src = MI.getOperand(0);
7424 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7425 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7426 return CreatedBB;
7427 }
7428
7429 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7430 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7431 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7432 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7433 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7434 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7435 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7436 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7437 MachineOperand &Src = MI.getOperand(1);
7438 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7439 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7440 return CreatedBB;
7441 }
7442
7443 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7444 //
7445 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7446 // scratch memory access. In both cases, the legalization never involves
7447 // conversion to the addr64 form.
7448 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
7449 (isMUBUF(MI) || isMTBUF(MI)))) {
7450 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7451 ? AMDGPU::OpName::rsrc
7452 : AMDGPU::OpName::srsrc;
7453 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7454 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7455 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7456
7457 AMDGPU::OpName SampOpName =
7458 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7459 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7460 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7461 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7462
7463 return CreatedBB;
7464 }
7465
7466 // Legalize SI_CALL
7467 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7468 MachineOperand *Dest = &MI.getOperand(0);
7469 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7470 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and the
7471 // following copies; we also need to move copies from and to physical
7472 // registers into the loop block.
7473 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7474 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7475
7476 // Also move the copies to physical registers into the loop block
7477 MachineBasicBlock &MBB = *MI.getParent();
7478 MachineBasicBlock::iterator Start(&MI);
7479 while (Start->getOpcode() != FrameSetupOpcode)
7480 --Start;
7481 MachineBasicBlock::iterator End(&MI);
7482 while (End->getOpcode() != FrameDestroyOpcode)
7483 ++End;
7484 // Also include following copies of the return value
7485 ++End;
7486 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7487 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7488 ++End;
7489 CreatedBB =
7490 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7491 }
7492 }
7493
7494 // Legalize s_sleep_var.
7495 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7496 const DebugLoc &DL = MI.getDebugLoc();
7497 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7498 int Src0Idx =
7499 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7500 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7501 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7502 .add(Src0);
7503 Src0.ChangeToRegister(Reg, false);
7504 return nullptr;
7505 }
7506
7507 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7508 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7509 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7510 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7511 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7512 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7513 for (MachineOperand &Src : MI.explicit_operands()) {
7514 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7515 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7516 }
7517 return CreatedBB;
7518 }
7519
7520 // Legalize MUBUF instructions.
7521 bool isSoffsetLegal = true;
7522 int SoffsetIdx =
7523 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7524 if (SoffsetIdx != -1) {
7525 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7526 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7527 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7528 isSoffsetLegal = false;
7529 }
7530 }
7531
7532 bool isRsrcLegal = true;
7533 int RsrcIdx =
7534 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7535 if (RsrcIdx != -1) {
7536 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7537 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7538 isRsrcLegal = false;
7539 }
7540
7541 // The operands are legal.
7542 if (isRsrcLegal && isSoffsetLegal)
7543 return CreatedBB;
7544
7545 if (!isRsrcLegal) {
7546 // Legalize a VGPR Rsrc
7547 //
7548 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7549 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7550 // a zero-value SRsrc.
7551 //
7552 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7553 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7554 // above.
7555 //
7556 // Otherwise we are on non-ADDR64 hardware, and/or we have
7557 // idxen/offen/bothen and we fall back to a waterfall loop.
7558
7559 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7560 MachineBasicBlock &MBB = *MI.getParent();
7561
7562 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7563 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7564 // This is already an ADDR64 instruction so we need to add the pointer
7565 // extracted from the resource descriptor to the current value of VAddr.
7566 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7567 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7568 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7569
7570 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7571 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7572 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7573
7574 unsigned RsrcPtr, NewSRsrc;
7575 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7576
7577 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7578 const DebugLoc &DL = MI.getDebugLoc();
7579 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7580 .addDef(CondReg0)
7581 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7582 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7583 .addImm(0);
7584
7585 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7586 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7587 .addDef(CondReg1, RegState::Dead)
7588 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7589 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7590 .addReg(CondReg0, RegState::Kill)
7591 .addImm(0);
7592
7593 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7594 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7595 .addReg(NewVAddrLo)
7596 .addImm(AMDGPU::sub0)
7597 .addReg(NewVAddrHi)
7598 .addImm(AMDGPU::sub1);
7599
7600 VAddr->setReg(NewVAddr);
7601 Rsrc->setReg(NewSRsrc);
7602 } else if (!VAddr && ST.hasAddr64()) {
7603 // This instruction is the _OFFSET variant, so we need to convert it to
7604 // ADDR64.
7605 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7606 "FIXME: Need to emit flat atomics here");
7607
7608 unsigned RsrcPtr, NewSRsrc;
7609 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7610
7611 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7612 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7613 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7614 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7615 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7616
7617 // Atomics with return have an additional tied operand and are
7618 // missing some of the special bits.
7619 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7620 MachineInstr *Addr64;
7621
7622 if (!VDataIn) {
7623 // Regular buffer load / store.
7624 MachineInstrBuilder MIB =
7625 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7626 .add(*VData)
7627 .addReg(NewVAddr)
7628 .addReg(NewSRsrc)
7629 .add(*SOffset)
7630 .add(*Offset);
7631
7632 if (const MachineOperand *CPol =
7633 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7634 MIB.addImm(CPol->getImm());
7635 }
7636
7637 if (const MachineOperand *TFE =
7638 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7639 MIB.addImm(TFE->getImm());
7640 }
7641
7642 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7643
7644 MIB.cloneMemRefs(MI);
7645 Addr64 = MIB;
7646 } else {
7647 // Atomics with return.
7648 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7649 .add(*VData)
7650 .add(*VDataIn)
7651 .addReg(NewVAddr)
7652 .addReg(NewSRsrc)
7653 .add(*SOffset)
7654 .add(*Offset)
7655 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7656 .cloneMemRefs(MI);
7657 }
7658
7659 MI.removeFromParent();
7660
7661 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7662 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7663 NewVAddr)
7664 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7665 .addImm(AMDGPU::sub0)
7666 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7667 .addImm(AMDGPU::sub1);
7668 } else {
7669 // Legalize a VGPR Rsrc and soffset together.
7670 if (!isSoffsetLegal) {
7671 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7672 CreatedBB =
7673 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7674 return CreatedBB;
7675 }
7676 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7677 return CreatedBB;
7678 }
7679 }
7680
7681 // Legalize a VGPR soffset.
7682 if (!isSoffsetLegal) {
7683 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7684 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7685 return CreatedBB;
7686 }
7687 return CreatedBB;
7688}
7689
7690 void SIInstrWorklist::insert(MachineInstr *MI) {
7691 InstrList.insert(MI);
7692 // Add MBUF instructions to the deferred list.
7693 int RsrcIdx =
7694 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7695 if (RsrcIdx != -1) {
7696 DeferredList.insert(MI);
7697 }
7698}
7699
7700 bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7701 return DeferredList.contains(MI);
7702}
7703
7704 // Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7705 // lowering (changing sgpr to vgpr).
7706 // This is mainly caused by 16-bit SALU and 16-bit VALU using registers of
7707 // different sizes. We need to legalize the operand sizes during the vgpr
7708 // lowering chain. This can be removed once sgpr16 is in place.
7709 void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7710 MachineRegisterInfo &MRI) const {
7711 if (!ST.useRealTrue16Insts())
7712 return;
7713
7714 unsigned Opcode = MI.getOpcode();
7715 MachineBasicBlock *MBB = MI.getParent();
7716 // Legalize operands and check for size mismatch
7717 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7718 OpIdx >= get(Opcode).getNumOperands() ||
7719 get(Opcode).operands()[OpIdx].RegClass == -1)
7720 return;
7721
7722 MachineOperand &Op = MI.getOperand(OpIdx);
7723 if (!Op.isReg() || !Op.getReg().isVirtual())
7724 return;
7725
7726 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7727 if (!RI.isVGPRClass(CurrRC))
7728 return;
7729
7730 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7731 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7732 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7733 Op.setSubReg(AMDGPU::lo16);
7734 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7735 const DebugLoc &DL = MI.getDebugLoc();
7736 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7737 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7738 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7739 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7740 .addReg(Op.getReg())
7741 .addImm(AMDGPU::lo16)
7742 .addReg(Undef)
7743 .addImm(AMDGPU::hi16);
7744 Op.setReg(NewDstReg);
7745 }
7746}
7747 void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7748 MachineRegisterInfo &MRI) const {
7749 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7750 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7751}
7752
7753 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7754 MachineDominatorTree *MDT) const {
7755
7756 while (!Worklist.empty()) {
7757 MachineInstr &Inst = *Worklist.top();
7758 Worklist.erase_top();
7759 // Skip MachineInstr in the deferred list.
7760 if (Worklist.isDeferred(&Inst))
7761 continue;
7762 moveToVALUImpl(Worklist, MDT, Inst);
7763 }
7764
7765 // Deferred list of instructions will be processed once
7766 // all the MachineInstr in the worklist are done.
7767 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7768 moveToVALUImpl(Worklist, MDT, *Inst);
7769 assert(Worklist.empty() &&
7770 "Deferred MachineInstr are not supposed to re-populate worklist");
7771 }
7772}
7773
7774 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7775                                  MachineDominatorTree *MDT,
7776 MachineInstr &Inst) const {
7777
7778 MachineBasicBlock *MBB = Inst.getParent();
7779 if (!MBB)
7780 return;
7781 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7782 unsigned Opcode = Inst.getOpcode();
7783 unsigned NewOpcode = getVALUOp(Inst);
7784 const DebugLoc &DL = Inst.getDebugLoc();
7785
7786 // Handle some special cases
7787 switch (Opcode) {
7788 default:
7789 break;
7790 case AMDGPU::S_ADD_I32:
7791 case AMDGPU::S_SUB_I32: {
7792 // FIXME: The u32 versions currently selected use the carry.
7793 bool Changed;
7794 MachineBasicBlock *CreatedBBTmp = nullptr;
7795 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7796 if (Changed)
7797 return;
7798
7799 // Default handling
7800 break;
7801 }
7802
7803 case AMDGPU::S_MUL_U64:
7804 if (ST.hasVectorMulU64()) {
7805 NewOpcode = AMDGPU::V_MUL_U64_e64;
7806 break;
7807 }
7808 // Split s_mul_u64 in 32-bit vector multiplications.
7809 splitScalarSMulU64(Worklist, Inst, MDT);
7810 Inst.eraseFromParent();
7811 return;
7812
7813 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7814 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7815 // This is a special case of s_mul_u64 where all the operands are either
7816 // zero extended or sign extended.
7817 splitScalarSMulPseudo(Worklist, Inst, MDT);
7818 Inst.eraseFromParent();
7819 return;
7820
7821 case AMDGPU::S_AND_B64:
7822 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7823 Inst.eraseFromParent();
7824 return;
7825
7826 case AMDGPU::S_OR_B64:
7827 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7828 Inst.eraseFromParent();
7829 return;
7830
7831 case AMDGPU::S_XOR_B64:
7832 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7833 Inst.eraseFromParent();
7834 return;
7835
7836 case AMDGPU::S_NAND_B64:
7837 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7838 Inst.eraseFromParent();
7839 return;
7840
7841 case AMDGPU::S_NOR_B64:
7842 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7843 Inst.eraseFromParent();
7844 return;
7845
7846 case AMDGPU::S_XNOR_B64:
7847 if (ST.hasDLInsts())
7848 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7849 else
7850 splitScalar64BitXnor(Worklist, Inst, MDT);
7851 Inst.eraseFromParent();
7852 return;
7853
7854 case AMDGPU::S_ANDN2_B64:
7855 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7856 Inst.eraseFromParent();
7857 return;
7858
7859 case AMDGPU::S_ORN2_B64:
7860 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7861 Inst.eraseFromParent();
7862 return;
7863
7864 case AMDGPU::S_BREV_B64:
7865 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7866 Inst.eraseFromParent();
7867 return;
7868
7869 case AMDGPU::S_NOT_B64:
7870 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7871 Inst.eraseFromParent();
7872 return;
7873
7874 case AMDGPU::S_BCNT1_I32_B64:
7875 splitScalar64BitBCNT(Worklist, Inst);
7876 Inst.eraseFromParent();
7877 return;
7878
7879 case AMDGPU::S_BFE_I64:
7880 splitScalar64BitBFE(Worklist, Inst);
7881 Inst.eraseFromParent();
7882 return;
7883
7884 case AMDGPU::S_FLBIT_I32_B64:
7885 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7886 Inst.eraseFromParent();
7887 return;
7888 case AMDGPU::S_FF1_I32_B64:
7889 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7890 Inst.eraseFromParent();
7891 return;
7892
7893 case AMDGPU::S_LSHL_B32:
7894 if (ST.hasOnlyRevVALUShifts()) {
7895 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7896 swapOperands(Inst);
7897 }
7898 break;
7899 case AMDGPU::S_ASHR_I32:
7900 if (ST.hasOnlyRevVALUShifts()) {
7901 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7902 swapOperands(Inst);
7903 }
7904 break;
7905 case AMDGPU::S_LSHR_B32:
7906 if (ST.hasOnlyRevVALUShifts()) {
7907 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7908 swapOperands(Inst);
7909 }
7910 break;
7911 case AMDGPU::S_LSHL_B64:
7912 if (ST.hasOnlyRevVALUShifts()) {
7913 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7914 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7915 : AMDGPU::V_LSHLREV_B64_e64;
7916 swapOperands(Inst);
7917 }
7918 break;
7919 case AMDGPU::S_ASHR_I64:
7920 if (ST.hasOnlyRevVALUShifts()) {
7921 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7922 swapOperands(Inst);
7923 }
7924 break;
7925 case AMDGPU::S_LSHR_B64:
7926 if (ST.hasOnlyRevVALUShifts()) {
7927 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7928 swapOperands(Inst);
7929 }
7930 break;
7931
7932 case AMDGPU::S_ABS_I32:
7933 lowerScalarAbs(Worklist, Inst);
7934 Inst.eraseFromParent();
7935 return;
7936
7937 case AMDGPU::S_ABSDIFF_I32:
7938 lowerScalarAbsDiff(Worklist, Inst);
7939 Inst.eraseFromParent();
7940 return;
7941
7942 case AMDGPU::S_CBRANCH_SCC0:
7943 case AMDGPU::S_CBRANCH_SCC1: {
7944 // Clear unused bits of vcc
7945 Register CondReg = Inst.getOperand(1).getReg();
7946 bool IsSCC = CondReg == AMDGPU::SCC;
7948 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
7949 .addReg(LMC.ExecReg)
7950 .addReg(IsSCC ? LMC.VccReg : CondReg);
7951 Inst.removeOperand(1);
7952 } break;
7953
7954 case AMDGPU::S_BFE_U64:
7955 case AMDGPU::S_BFM_B64:
7956 llvm_unreachable("Moving this op to VALU not implemented");
7957
7958 case AMDGPU::S_PACK_LL_B32_B16:
7959 case AMDGPU::S_PACK_LH_B32_B16:
7960 case AMDGPU::S_PACK_HL_B32_B16:
7961 case AMDGPU::S_PACK_HH_B32_B16:
7962 movePackToVALU(Worklist, MRI, Inst);
7963 Inst.eraseFromParent();
7964 return;
7965
7966 case AMDGPU::S_XNOR_B32:
7967 lowerScalarXnor(Worklist, Inst);
7968 Inst.eraseFromParent();
7969 return;
7970
7971 case AMDGPU::S_NAND_B32:
7972 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7973 Inst.eraseFromParent();
7974 return;
7975
7976 case AMDGPU::S_NOR_B32:
7977 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7978 Inst.eraseFromParent();
7979 return;
7980
7981 case AMDGPU::S_ANDN2_B32:
7982 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7983 Inst.eraseFromParent();
7984 return;
7985
7986 case AMDGPU::S_ORN2_B32:
7987 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7988 Inst.eraseFromParent();
7989 return;
7990
7991 // TODO: remove as soon as everything is ready
7992 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7993 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7994 // can only be selected from the uniform SDNode.
7995 case AMDGPU::S_ADD_CO_PSEUDO:
7996 case AMDGPU::S_SUB_CO_PSEUDO: {
7997 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7998 ? AMDGPU::V_ADDC_U32_e64
7999 : AMDGPU::V_SUBB_U32_e64;
8000 const auto *CarryRC = RI.getWaveMaskRegClass();
8001
8002 Register CarryInReg = Inst.getOperand(4).getReg();
8003 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
8004 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
8005 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
8006 .addReg(CarryInReg);
8007 }
8008
8009 Register CarryOutReg = Inst.getOperand(1).getReg();
8010
8011 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
8012 MRI.getRegClass(Inst.getOperand(0).getReg())));
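    // V_ADDC_U32 / V_SUBB_U32 define both the result and a carry-out, so the
    // carry-out register is added explicitly as a def ahead of the sources.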
8013 MachineInstr *CarryOp =
8014 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
8015 .addReg(CarryOutReg, RegState::Define)
8016 .add(Inst.getOperand(2))
8017 .add(Inst.getOperand(3))
8018 .addReg(CarryInReg)
8019 .addImm(0);
8020 legalizeOperands(*CarryOp);
8021 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
8022 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8023 Inst.eraseFromParent();
8024 }
8025 return;
8026 case AMDGPU::S_UADDO_PSEUDO:
8027 case AMDGPU::S_USUBO_PSEUDO: {
8028 MachineOperand &Dest0 = Inst.getOperand(0);
8029 MachineOperand &Dest1 = Inst.getOperand(1);
8030 MachineOperand &Src0 = Inst.getOperand(2);
8031 MachineOperand &Src1 = Inst.getOperand(3);
8032
8033 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8034 ? AMDGPU::V_ADD_CO_U32_e64
8035 : AMDGPU::V_SUB_CO_U32_e64;
8036 const TargetRegisterClass *NewRC =
8037 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
8038 Register DestReg = MRI.createVirtualRegister(NewRC);
8039 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
8040 .addReg(Dest1.getReg(), RegState::Define)
8041 .add(Src0)
8042 .add(Src1)
8043 .addImm(0); // clamp bit
8044
8045 legalizeOperands(*NewInstr, MDT);
8046 MRI.replaceRegWith(Dest0.getReg(), DestReg);
8047 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8048 Inst.eraseFromParent();
8049 }
8050 return;
8051 case AMDGPU::S_LSHL1_ADD_U32:
8052 case AMDGPU::S_LSHL2_ADD_U32:
8053 case AMDGPU::S_LSHL3_ADD_U32:
8054 case AMDGPU::S_LSHL4_ADD_U32: {
8055 MachineOperand &Dest = Inst.getOperand(0);
8056 MachineOperand &Src0 = Inst.getOperand(1);
8057 MachineOperand &Src1 = Inst.getOperand(2);
8058 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8059 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8060 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8061 : 4);
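    // S_LSHLn_ADD_U32 maps directly onto V_LSHL_ADD_U32, with the shift
    // amount n passed as an explicit immediate operand.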
8062
8063 const TargetRegisterClass *NewRC =
8064 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
8065 Register DestReg = MRI.createVirtualRegister(NewRC);
8066 MachineInstr *NewInstr =
8067 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8068 .add(Src0)
8069 .addImm(ShiftAmt)
8070 .add(Src1);
8071
8072 legalizeOperands(*NewInstr, MDT);
8073 MRI.replaceRegWith(Dest.getReg(), DestReg);
8074 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8075 Inst.eraseFromParent();
8076 }
8077 return;
8078 case AMDGPU::S_CSELECT_B32:
8079 case AMDGPU::S_CSELECT_B64:
8080 lowerSelect(Worklist, Inst, MDT);
8081 Inst.eraseFromParent();
8082 return;
8083 case AMDGPU::S_CMP_EQ_I32:
8084 case AMDGPU::S_CMP_LG_I32:
8085 case AMDGPU::S_CMP_GT_I32:
8086 case AMDGPU::S_CMP_GE_I32:
8087 case AMDGPU::S_CMP_LT_I32:
8088 case AMDGPU::S_CMP_LE_I32:
8089 case AMDGPU::S_CMP_EQ_U32:
8090 case AMDGPU::S_CMP_LG_U32:
8091 case AMDGPU::S_CMP_GT_U32:
8092 case AMDGPU::S_CMP_GE_U32:
8093 case AMDGPU::S_CMP_LT_U32:
8094 case AMDGPU::S_CMP_LE_U32:
8095 case AMDGPU::S_CMP_EQ_U64:
8096 case AMDGPU::S_CMP_LG_U64:
8097 case AMDGPU::S_CMP_LT_F32:
8098 case AMDGPU::S_CMP_EQ_F32:
8099 case AMDGPU::S_CMP_LE_F32:
8100 case AMDGPU::S_CMP_GT_F32:
8101 case AMDGPU::S_CMP_LG_F32:
8102 case AMDGPU::S_CMP_GE_F32:
8103 case AMDGPU::S_CMP_O_F32:
8104 case AMDGPU::S_CMP_U_F32:
8105 case AMDGPU::S_CMP_NGE_F32:
8106 case AMDGPU::S_CMP_NLG_F32:
8107 case AMDGPU::S_CMP_NGT_F32:
8108 case AMDGPU::S_CMP_NLE_F32:
8109 case AMDGPU::S_CMP_NEQ_F32:
8110 case AMDGPU::S_CMP_NLT_F32: {
8111 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8112 auto NewInstr =
8113 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8114 .setMIFlags(Inst.getFlags());
8115 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
8116 0) {
8117 NewInstr
8118 .addImm(0) // src0_modifiers
8119 .add(Inst.getOperand(0)) // src0
8120 .addImm(0) // src1_modifiers
8121 .add(Inst.getOperand(1)) // src1
8122 .addImm(0); // clamp
8123 } else {
8124 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
8125 }
8126 legalizeOperands(*NewInstr, MDT);
8127 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8128 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8129 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8130 Inst.eraseFromParent();
8131 return;
8132 }
8133 case AMDGPU::S_CMP_LT_F16:
8134 case AMDGPU::S_CMP_EQ_F16:
8135 case AMDGPU::S_CMP_LE_F16:
8136 case AMDGPU::S_CMP_GT_F16:
8137 case AMDGPU::S_CMP_LG_F16:
8138 case AMDGPU::S_CMP_GE_F16:
8139 case AMDGPU::S_CMP_O_F16:
8140 case AMDGPU::S_CMP_U_F16:
8141 case AMDGPU::S_CMP_NGE_F16:
8142 case AMDGPU::S_CMP_NLG_F16:
8143 case AMDGPU::S_CMP_NGT_F16:
8144 case AMDGPU::S_CMP_NLE_F16:
8145 case AMDGPU::S_CMP_NEQ_F16:
8146 case AMDGPU::S_CMP_NLT_F16: {
8147 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8148 auto NewInstr =
8149 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8150 .setMIFlags(Inst.getFlags());
8151 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8152 NewInstr
8153 .addImm(0) // src0_modifiers
8154 .add(Inst.getOperand(0)) // src0
8155 .addImm(0) // src1_modifiers
8156 .add(Inst.getOperand(1)) // src1
8157 .addImm(0); // clamp
8158 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8159 NewInstr.addImm(0); // op_sel0
8160 } else {
8161 NewInstr
8162 .add(Inst.getOperand(0))
8163 .add(Inst.getOperand(1));
8164 }
8165 legalizeOperandsVALUt16(*NewInstr, MRI);
8166 legalizeOperands(*NewInstr, MDT);
8167 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8168 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8169 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8170 Inst.eraseFromParent();
8171 return;
8172 }
8173 case AMDGPU::S_CVT_HI_F32_F16: {
8174 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8175 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8176 if (ST.useRealTrue16Insts()) {
8177 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8178 .add(Inst.getOperand(1));
8179 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8180 .addImm(0) // src0_modifiers
8181 .addReg(TmpReg, 0, AMDGPU::hi16)
8182 .addImm(0) // clamp
8183 .addImm(0) // omod
8184 .addImm(0); // op_sel0
8185 } else {
8186 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8187 .addImm(16)
8188 .add(Inst.getOperand(1));
8189 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8190 .addImm(0) // src0_modifiers
8191 .addReg(TmpReg)
8192 .addImm(0) // clamp
8193 .addImm(0); // omod
8194 }
8195
8196 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8197 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8198 Inst.eraseFromParent();
8199 return;
8200 }
8201 case AMDGPU::S_MINIMUM_F32:
8202 case AMDGPU::S_MAXIMUM_F32: {
8203 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8204 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8205 .addImm(0) // src0_modifiers
8206 .add(Inst.getOperand(1))
8207 .addImm(0) // src1_modifiers
8208 .add(Inst.getOperand(2))
8209 .addImm(0) // clamp
8210 .addImm(0); // omod
8211 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8212
8213 legalizeOperands(*NewInstr, MDT);
8214 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8215 Inst.eraseFromParent();
8216 return;
8217 }
8218 case AMDGPU::S_MINIMUM_F16:
8219 case AMDGPU::S_MAXIMUM_F16: {
8220 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8221 ? &AMDGPU::VGPR_16RegClass
8222 : &AMDGPU::VGPR_32RegClass);
8223 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8224 .addImm(0) // src0_modifiers
8225 .add(Inst.getOperand(1))
8226 .addImm(0) // src1_modifiers
8227 .add(Inst.getOperand(2))
8228 .addImm(0) // clamp
8229 .addImm(0) // omod
8230 .addImm(0); // opsel0
8231 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8232 legalizeOperandsVALUt16(*NewInstr, MRI);
8233 legalizeOperands(*NewInstr, MDT);
8234 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8235 Inst.eraseFromParent();
8236 return;
8237 }
8238 case AMDGPU::V_S_EXP_F16_e64:
8239 case AMDGPU::V_S_LOG_F16_e64:
8240 case AMDGPU::V_S_RCP_F16_e64:
8241 case AMDGPU::V_S_RSQ_F16_e64:
8242 case AMDGPU::V_S_SQRT_F16_e64: {
8243 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8244 ? &AMDGPU::VGPR_16RegClass
8245 : &AMDGPU::VGPR_32RegClass);
8246 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8247 .add(Inst.getOperand(1)) // src0_modifiers
8248 .add(Inst.getOperand(2))
8249 .add(Inst.getOperand(3)) // clamp
8250 .add(Inst.getOperand(4)) // omod
8251 .setMIFlags(Inst.getFlags());
8252 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8253 NewInstr.addImm(0); // opsel0
8254 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8255 legalizeOperandsVALUt16(*NewInstr, MRI);
8256 legalizeOperands(*NewInstr, MDT);
8257 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8258 Inst.eraseFromParent();
8259 return;
8260 }
8261 }
8262
8263 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8264 // We cannot move this instruction to the VALU, so we should try to
8265 // legalize its operands instead.
8266 legalizeOperands(Inst, MDT);
8267 return;
8268 }
8269 // Handle converting generic instructions like COPY-to-SGPR into
8270 // COPY-to-VGPR.
8271 if (NewOpcode == Opcode) {
8272 Register DstReg = Inst.getOperand(0).getReg();
8273 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8274
8275 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8276 // hope for the best.
8277 if (Inst.isCopy() && DstReg.isPhysical() &&
8278 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8279 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8280 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8281 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8282 .add(Inst.getOperand(1));
8283 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8284 DstReg)
8285 .addReg(NewDst);
8286
8287 Inst.eraseFromParent();
8288 return;
8289 }
8290
8291 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
8292 Register NewDstReg = Inst.getOperand(1).getReg();
8293 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8294 if (const TargetRegisterClass *CommonRC =
8295 RI.getCommonSubClass(NewDstRC, SrcRC)) {
8296 // Instead of creating a copy where src and dst are the same register
8297 // class, we just replace all uses of dst with src. These kinds of
8298 // copies interfere with the heuristics MachineSink uses to decide
8299 // whether or not to split a critical edge, since the pass assumes
8300 // that copies will end up as machine instructions and not be
8301 // eliminated.
8302 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8303 MRI.replaceRegWith(DstReg, NewDstReg);
8304 MRI.clearKillFlags(NewDstReg);
8305 Inst.getOperand(0).setReg(DstReg);
8306
8307 if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8308 llvm_unreachable("failed to constrain register");
8309
8310 Inst.eraseFromParent();
8311 // Legalize t16 operands, since replaceRegWith was called after the users were added to the VALU worklist.
8312 for (MachineOperand &MO :
8313 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8314 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8315 }
8316
8317 return;
8318 }
8319 }
8320
8321 // If this is a v2s copy between a 16-bit and a 32-bit reg,
8322 // replace the vgpr copy with a reg_sequence/extract_subreg.
8323 // This can be removed after we have sgpr16 in place.
8324 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8325 Inst.getOperand(1).getReg().isVirtual() &&
8326 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8327 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8328 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8329 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8330 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8331 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8332 get(AMDGPU::IMPLICIT_DEF), Undef);
8333 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8334 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8335 .addReg(Inst.getOperand(1).getReg())
8336 .addImm(AMDGPU::lo16)
8337 .addReg(Undef)
8338 .addImm(AMDGPU::hi16);
8339 Inst.eraseFromParent();
8340 MRI.replaceRegWith(DstReg, NewDstReg);
8341 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8342 return;
8343 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8344 AMDGPU::lo16)) {
8345 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8346 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8347 MRI.replaceRegWith(DstReg, NewDstReg);
8348 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8349 return;
8350 }
8351 }
8352
8353 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8354 MRI.replaceRegWith(DstReg, NewDstReg);
8355 legalizeOperands(Inst, MDT);
8356 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8357 return;
8358 }
8359
8360 // Use the new VALU Opcode.
8361 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8362 .setMIFlags(Inst.getFlags());
8363 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8364 // Intersperse VOP3 modifiers among the SALU operands.
8365 NewInstr->addOperand(Inst.getOperand(0));
8366 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8367 AMDGPU::OpName::src0_modifiers) >= 0)
8368 NewInstr.addImm(0);
8369 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8370 const MachineOperand &Src = Inst.getOperand(1);
8371 NewInstr->addOperand(Src);
8372 }
8373
8374 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8375 // We are converting these to a BFE, so we need to add the missing
8376 // operands for the size and offset.
8377 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8378 NewInstr.addImm(0);
8379 NewInstr.addImm(Size);
8380 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8381 // The VALU version adds the second operand to the result, so insert an
8382 // extra 0 operand.
8383 NewInstr.addImm(0);
8384 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8385 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8386 // If we need to move this to VGPRs, we need to unpack the second
8387 // operand back into the 2 separate ones for bit offset and width.
8388 assert(OffsetWidthOp.isImm() &&
8389 "Scalar BFE is only implemented for constant width and offset");
8390 uint32_t Imm = OffsetWidthOp.getImm();
8391
8392 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8393 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
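      // For example, an immediate of 0x100008 unpacks to Offset = 8 and
      // BitWidth = 16.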
8394 NewInstr.addImm(Offset);
8395 NewInstr.addImm(BitWidth);
8396 } else {
8397 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8398 AMDGPU::OpName::src1_modifiers) >= 0)
8399 NewInstr.addImm(0);
8400 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8401 NewInstr->addOperand(Inst.getOperand(2));
8402 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8403 AMDGPU::OpName::src2_modifiers) >= 0)
8404 NewInstr.addImm(0);
8405 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8406 NewInstr->addOperand(Inst.getOperand(3));
8407 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8408 NewInstr.addImm(0);
8409 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8410 NewInstr.addImm(0);
8411 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8412 NewInstr.addImm(0);
8413 }
8414 } else {
8415 // Just copy the SALU operands.
8416 for (const MachineOperand &Op : Inst.explicit_operands())
8417 NewInstr->addOperand(Op);
8418 }
8419
8420 // Remove any references to SCC. Vector instructions can't read from it, and
8421 // we're just about to add the implicit use / defs of VCC, and we don't want
8422 // both.
8423 for (MachineOperand &Op : Inst.implicit_operands()) {
8424 if (Op.getReg() == AMDGPU::SCC) {
8425 // Only propagate through live-def of SCC.
8426 if (Op.isDef() && !Op.isDead())
8427 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8428 if (Op.isUse())
8429 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8430 }
8431 }
8432 Inst.eraseFromParent();
8433 Register NewDstReg;
8434 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8435 Register DstReg = NewInstr->getOperand(0).getReg();
8436 assert(DstReg.isVirtual());
8437 // Update the destination register class.
8438 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8439 assert(NewDstRC);
8440 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8441 MRI.replaceRegWith(DstReg, NewDstReg);
8442 }
8443 fixImplicitOperands(*NewInstr);
8444
8445 legalizeOperandsVALUt16(*NewInstr, MRI);
8446
8447 // Legalize the operands
8448 legalizeOperands(*NewInstr, MDT);
8449 if (NewDstReg)
8450 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8451}
8452
8453// Add/sub require special handling to deal with carry outs.
8454std::pair<bool, MachineBasicBlock *>
8455SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8456 MachineDominatorTree *MDT) const {
8457 if (ST.hasAddNoCarry()) {
8458 // Assume there is no user of scc since we don't select this in that case.
8459 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8460 // is used.
8461
8462 MachineBasicBlock &MBB = *Inst.getParent();
8463 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8464
8465 Register OldDstReg = Inst.getOperand(0).getReg();
8466 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8467
8468 unsigned Opc = Inst.getOpcode();
8469 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8470
8471 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8472 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8473
8474 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8475 Inst.removeOperand(3);
8476
8477 Inst.setDesc(get(NewOpc));
8478 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8479 Inst.addImplicitDefUseOperands(*MBB.getParent());
8480 MRI.replaceRegWith(OldDstReg, ResultReg);
8481 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8482
8483 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8484 return std::pair(true, NewBB);
8485 }
8486
8487 return std::pair(false, nullptr);
8488}
8489
8490void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8491 MachineDominatorTree *MDT) const {
8492
8493 MachineBasicBlock &MBB = *Inst.getParent();
8494 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8495 MachineBasicBlock::iterator MII = Inst;
8496 const DebugLoc &DL = Inst.getDebugLoc();
8497
8498 MachineOperand &Dest = Inst.getOperand(0);
8499 MachineOperand &Src0 = Inst.getOperand(1);
8500 MachineOperand &Src1 = Inst.getOperand(2);
8501 MachineOperand &Cond = Inst.getOperand(3);
8502
8503 Register CondReg = Cond.getReg();
8504 bool IsSCC = (CondReg == AMDGPU::SCC);
8505
8506 // If this is a trivial select where the condition is effectively not SCC
8507 // (CondReg is a source of copy to SCC), then the select is semantically
8508 // equivalent to copying CondReg. Hence, there is no need to create
8509 // V_CNDMASK, we can just use that and bail out.
8510 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8511 (Src1.getImm() == 0)) {
8512 MRI.replaceRegWith(Dest.getReg(), CondReg);
8513 return;
8514 }
8515
8516 Register NewCondReg = CondReg;
8517 if (IsSCC) {
8518 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8519 NewCondReg = MRI.createVirtualRegister(TC);
8520
8521 // Now look for the closest SCC def; if it is a copy, replace CondReg
8522 // with the COPY's source register.
8523 bool CopyFound = false;
8524 for (MachineInstr &CandI :
8525 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8526 Inst.getParent()->rend())) {
8527 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8528 -1) {
8529 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8530 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8531 .addReg(CandI.getOperand(1).getReg());
8532 CopyFound = true;
8533 }
8534 break;
8535 }
8536 }
8537 if (!CopyFound) {
8538 // SCC def is not a copy
8539 // Insert a trivial select instead of creating a copy, because a copy from
8540 // SCC would semantically mean just copying a single bit, but we may need
8541 // the result to be a vector condition mask that needs preserving.
8542 unsigned Opcode =
8543 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8544 auto NewSelect =
8545 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8546 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8547 }
8548 }
8549
8550 Register NewDestReg = MRI.createVirtualRegister(
8551 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8552 MachineInstr *NewInst;
8553 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8554 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8555 .addImm(0)
8556 .add(Src1) // False
8557 .addImm(0)
8558 .add(Src0) // True
8559 .addReg(NewCondReg);
8560 } else {
8561 NewInst =
8562 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8563 .add(Src1) // False
8564 .add(Src0) // True
8565 .addReg(NewCondReg);
8566 }
8567 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8568 legalizeOperands(*NewInst, MDT);
8569 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8570}
8571
8572void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8573 MachineInstr &Inst) const {
8574 MachineBasicBlock &MBB = *Inst.getParent();
8575 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8576 MachineBasicBlock::iterator MII = Inst;
8577 const DebugLoc &DL = Inst.getDebugLoc();
8578
8579 MachineOperand &Dest = Inst.getOperand(0);
8580 MachineOperand &Src = Inst.getOperand(1);
8581 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8582 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8583
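 // Expand the scalar abs as max(x, 0 - x): negate Src into TmpReg with a
 // vector subtract from zero, then take the signed maximum of the two.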
8584 unsigned SubOp = ST.hasAddNoCarry() ?
8585 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8586
8587 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8588 .addImm(0)
8589 .addReg(Src.getReg());
8590
8591 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8592 .addReg(Src.getReg())
8593 .addReg(TmpReg);
8594
8595 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8596 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8597}
8598
8599void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8600 MachineInstr &Inst) const {
8601 MachineBasicBlock &MBB = *Inst.getParent();
8602 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8603 MachineBasicBlock::iterator MII = Inst;
8604 const DebugLoc &DL = Inst.getDebugLoc();
8605
8606 MachineOperand &Dest = Inst.getOperand(0);
8607 MachineOperand &Src1 = Inst.getOperand(1);
8608 MachineOperand &Src2 = Inst.getOperand(2);
8609 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8610 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8611 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8612
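 // Expand the scalar absolute difference as max(d, 0 - d) with d = Src1 - Src2,
 // reusing the negate-and-max pattern from lowerScalarAbs above.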
8613 unsigned SubOp =
8614 ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8615
8616 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8617 .addReg(Src1.getReg())
8618 .addReg(Src2.getReg());
8619
8620 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8621
8622 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8623 .addReg(SubResultReg)
8624 .addReg(TmpReg);
8625
8626 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8627 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8628}
8629
8630void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8631 MachineInstr &Inst) const {
8632 MachineBasicBlock &MBB = *Inst.getParent();
8633 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8634 MachineBasicBlock::iterator MII = Inst;
8635 const DebugLoc &DL = Inst.getDebugLoc();
8636
8637 MachineOperand &Dest = Inst.getOperand(0);
8638 MachineOperand &Src0 = Inst.getOperand(1);
8639 MachineOperand &Src1 = Inst.getOperand(2);
8640
8641 if (ST.hasDLInsts()) {
8642 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8643 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8644 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8645
8646 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8647 .add(Src0)
8648 .add(Src1);
8649
8650 MRI.replaceRegWith(Dest.getReg(), NewDest);
8651 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8652 } else {
8653 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8654 // invert either source and then perform the XOR. If either source is a
8655 // scalar register, then we can leave the inversion on the scalar unit to
8656 // achieve a better distribution of scalar and vector instructions.
8657 bool Src0IsSGPR = Src0.isReg() &&
8658 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8659 bool Src1IsSGPR = Src1.isReg() &&
8660 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8661 MachineInstr *Xor;
8662 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8663 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8664
8665 // Build a pair of scalar instructions and add them to the work list.
8666 // The next iteration over the work list will lower these to the vector
8667 // unit as necessary.
8668 if (Src0IsSGPR) {
8669 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8670 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8671 .addReg(Temp)
8672 .add(Src1);
8673 } else if (Src1IsSGPR) {
8674 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8675 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8676 .add(Src0)
8677 .addReg(Temp);
8678 } else {
8679 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8680 .add(Src0)
8681 .add(Src1);
8682 MachineInstr *Not =
8683 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8684 Worklist.insert(Not);
8685 }
8686
8687 MRI.replaceRegWith(Dest.getReg(), NewDest);
8688
8689 Worklist.insert(Xor);
8690
8691 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8692 }
8693}
8694
8695void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8696 MachineInstr &Inst,
8697 unsigned Opcode) const {
8698 MachineBasicBlock &MBB = *Inst.getParent();
8699 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8700 MachineBasicBlock::iterator MII = Inst;
8701 const DebugLoc &DL = Inst.getDebugLoc();
8702
8703 MachineOperand &Dest = Inst.getOperand(0);
8704 MachineOperand &Src0 = Inst.getOperand(1);
8705 MachineOperand &Src1 = Inst.getOperand(2);
8706
8707 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8708 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8709
8710 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8711 .add(Src0)
8712 .add(Src1);
8713
8714 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8715 .addReg(Interm);
8716
8717 Worklist.insert(&Op);
8718 Worklist.insert(&Not);
8719
8720 MRI.replaceRegWith(Dest.getReg(), NewDest);
8721 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8722}
8723
8724void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8725 MachineInstr &Inst,
8726 unsigned Opcode) const {
8727 MachineBasicBlock &MBB = *Inst.getParent();
8728 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8729 MachineBasicBlock::iterator MII = Inst;
8730 const DebugLoc &DL = Inst.getDebugLoc();
8731
8732 MachineOperand &Dest = Inst.getOperand(0);
8733 MachineOperand &Src0 = Inst.getOperand(1);
8734 MachineOperand &Src1 = Inst.getOperand(2);
8735
8736 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8737 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8738
8739 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8740 .add(Src1);
8741
8742 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8743 .add(Src0)
8744 .addReg(Interm);
8745
8746 Worklist.insert(&Not);
8747 Worklist.insert(&Op);
8748
8749 MRI.replaceRegWith(Dest.getReg(), NewDest);
8750 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8751}
8752
8753void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8754 MachineInstr &Inst, unsigned Opcode,
8755 bool Swap) const {
8756 MachineBasicBlock &MBB = *Inst.getParent();
8757 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8758
8759 MachineOperand &Dest = Inst.getOperand(0);
8760 MachineOperand &Src0 = Inst.getOperand(1);
8761 const DebugLoc &DL = Inst.getDebugLoc();
8762
8763 MachineBasicBlock::iterator MII = Inst;
8764
8765 const MCInstrDesc &InstDesc = get(Opcode);
8766 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8767 MRI.getRegClass(Src0.getReg()) :
8768 &AMDGPU::SGPR_32RegClass;
8769
8770 const TargetRegisterClass *Src0SubRC =
8771 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8772
8773 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8774 AMDGPU::sub0, Src0SubRC);
8775
8776 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8777 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8778 const TargetRegisterClass *NewDestSubRC =
8779 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8780
8781 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8782 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8783
8784 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8785 AMDGPU::sub1, Src0SubRC);
8786
8787 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8788 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8789
8790 if (Swap)
8791 std::swap(DestSub0, DestSub1);
8792
8793 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8794 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8795 .addReg(DestSub0)
8796 .addImm(AMDGPU::sub0)
8797 .addReg(DestSub1)
8798 .addImm(AMDGPU::sub1);
8799
8800 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8801
8802 Worklist.insert(&LoHalf);
8803 Worklist.insert(&HiHalf);
8804
8805 // We don't need to legalizeOperands here because for a single operand, src0
8806 // will support any kind of input.
8807
8808 // Move all users of this moved value.
8809 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8810}
8811
8812// There is not a vector equivalent of s_mul_u64. For this reason, we need to
8813// split the s_mul_u64 into 32-bit vector multiplications.
8814void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8815 MachineInstr &Inst,
8816 MachineDominatorTree *MDT) const {
8817 MachineBasicBlock &MBB = *Inst.getParent();
8818 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8819
8820 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8821 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8822 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8823
8824 MachineOperand &Dest = Inst.getOperand(0);
8825 MachineOperand &Src0 = Inst.getOperand(1);
8826 MachineOperand &Src1 = Inst.getOperand(2);
8827 const DebugLoc &DL = Inst.getDebugLoc();
8828 MachineBasicBlock::iterator MII = Inst;
8829
8830 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8831 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8832 const TargetRegisterClass *Src0SubRC =
8833 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8834 if (RI.isSGPRClass(Src0SubRC))
8835 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8836 const TargetRegisterClass *Src1SubRC =
8837 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8838 if (RI.isSGPRClass(Src1SubRC))
8839 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8840
8841 // First, we extract the low 32-bit and high 32-bit values from each of the
8842 // operands.
8843 MachineOperand Op0L =
8844 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8845 MachineOperand Op1L =
8846 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8847 MachineOperand Op0H =
8848 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8849 MachineOperand Op1H =
8850 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8851
8852 // The multiplication is done as follows:
8853 //
8854 // Op1H Op1L
8855 // * Op0H Op0L
8856 // --------------------
8857 // Op1H*Op0L Op1L*Op0L
8858 // + Op1H*Op0H Op1L*Op0H
8859 // -----------------------------------------
8860 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8861 //
8862 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8863 // value and that would overflow.
8864 // The low 32-bit value is Op1L*Op0L.
8865 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
8866
8867 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8868 MachineInstr *Op1L_Op0H =
8869 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8870 .add(Op1L)
8871 .add(Op0H);
8872
8873 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8874 MachineInstr *Op1H_Op0L =
8875 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8876 .add(Op1H)
8877 .add(Op0L);
8878
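 // V_MUL_HI_U32 of the two low halves yields the upper 32 bits of Op1L*Op0L,
 // i.e. the carry that is folded into the high word below.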
8879 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8880 MachineInstr *Carry =
8881 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8882 .add(Op1L)
8883 .add(Op0L);
8884
8885 MachineInstr *LoHalf =
8886 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8887 .add(Op1L)
8888 .add(Op0L);
8889
8890 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8891 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8892 .addReg(Op1L_Op0H_Reg)
8893 .addReg(Op1H_Op0L_Reg);
8894
8895 MachineInstr *HiHalf =
8896 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8897 .addReg(AddReg)
8898 .addReg(CarryReg);
8899
8900 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8901 .addReg(DestSub0)
8902 .addImm(AMDGPU::sub0)
8903 .addReg(DestSub1)
8904 .addImm(AMDGPU::sub1);
8905
8906 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8907
8908 // Try to legalize the operands in case we need to swap the order to keep it
8909 // valid.
8910 legalizeOperands(*Op1L_Op0H, MDT);
8911 legalizeOperands(*Op1H_Op0L, MDT);
8912 legalizeOperands(*Carry, MDT);
8913 legalizeOperands(*LoHalf, MDT);
8914 legalizeOperands(*Add, MDT);
8915 legalizeOperands(*HiHalf, MDT);
8916
8917 // Move all users of this moved value.
8918 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8919}
8920
8921// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8922// multiplications.
8923void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8924 MachineInstr &Inst,
8925 MachineDominatorTree *MDT) const {
8926 MachineBasicBlock &MBB = *Inst.getParent();
8927 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8928
8929 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8930 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8931 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8932
8933 MachineOperand &Dest = Inst.getOperand(0);
8934 MachineOperand &Src0 = Inst.getOperand(1);
8935 MachineOperand &Src1 = Inst.getOperand(2);
8936 const DebugLoc &DL = Inst.getDebugLoc();
8937 MachineBasicBlock::iterator MII = Inst;
8938
8939 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8940 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8941 const TargetRegisterClass *Src0SubRC =
8942 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8943 if (RI.isSGPRClass(Src0SubRC))
8944 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8945 const TargetRegisterClass *Src1SubRC =
8946 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8947 if (RI.isSGPRClass(Src1SubRC))
8948 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8949
8950 // First, we extract the low 32-bit and high 32-bit values from each of the
8951 // operands.
8952 MachineOperand Op0L =
8953 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8954 MachineOperand Op1L =
8955 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8956
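 // Both pseudos multiply values whose upper halves are only the zero- or
 // sign-extension of the low 32 bits, so the low halves alone determine the
 // product: V_MUL_LO_U32 gives the low word and the unsigned/signed V_MUL_HI
 // gives the high word.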
8957 unsigned Opc = Inst.getOpcode();
8958 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8959 ? AMDGPU::V_MUL_HI_U32_e64
8960 : AMDGPU::V_MUL_HI_I32_e64;
8961 MachineInstr *HiHalf =
8962 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8963
8964 MachineInstr *LoHalf =
8965 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8966 .add(Op1L)
8967 .add(Op0L);
8968
8969 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8970 .addReg(DestSub0)
8971 .addImm(AMDGPU::sub0)
8972 .addReg(DestSub1)
8973 .addImm(AMDGPU::sub1);
8974
8975 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8976
8977 // Try to legalize the operands in case we need to swap the order to keep it
8978 // valid.
8979 legalizeOperands(*HiHalf, MDT);
8980 legalizeOperands(*LoHalf, MDT);
8981
8982 // Move all users of this moved value.
8983 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8984}
8985
8986void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8987 MachineInstr &Inst, unsigned Opcode,
8988 MachineDominatorTree *MDT) const {
8989 MachineBasicBlock &MBB = *Inst.getParent();
8990 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8991
8992 MachineOperand &Dest = Inst.getOperand(0);
8993 MachineOperand &Src0 = Inst.getOperand(1);
8994 MachineOperand &Src1 = Inst.getOperand(2);
8995 const DebugLoc &DL = Inst.getDebugLoc();
8996
8997 MachineBasicBlock::iterator MII = Inst;
8998
8999 const MCInstrDesc &InstDesc = get(Opcode);
9000 const TargetRegisterClass *Src0RC = Src0.isReg() ?
9001 MRI.getRegClass(Src0.getReg()) :
9002 &AMDGPU::SGPR_32RegClass;
9003
9004 const TargetRegisterClass *Src0SubRC =
9005 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9006 const TargetRegisterClass *Src1RC = Src1.isReg() ?
9007 MRI.getRegClass(Src1.getReg()) :
9008 &AMDGPU::SGPR_32RegClass;
9009
9010 const TargetRegisterClass *Src1SubRC =
9011 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9012
9013 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9014 AMDGPU::sub0, Src0SubRC);
9015 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9016 AMDGPU::sub0, Src1SubRC);
9017 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9018 AMDGPU::sub1, Src0SubRC);
9019 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9020 AMDGPU::sub1, Src1SubRC);
9021
9022 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9023 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
9024 const TargetRegisterClass *NewDestSubRC =
9025 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9026
9027 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
9028 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
9029 .add(SrcReg0Sub0)
9030 .add(SrcReg1Sub0);
9031
9032 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
9033 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
9034 .add(SrcReg0Sub1)
9035 .add(SrcReg1Sub1);
9036
9037 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
9038 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9039 .addReg(DestSub0)
9040 .addImm(AMDGPU::sub0)
9041 .addReg(DestSub1)
9042 .addImm(AMDGPU::sub1);
9043
9044 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9045
9046 Worklist.insert(&LoHalf);
9047 Worklist.insert(&HiHalf);
9048
9049 // Move all users of this moved value.
9050 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9051}
9052
9053void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9054 MachineInstr &Inst,
9055 MachineDominatorTree *MDT) const {
9056 MachineBasicBlock &MBB = *Inst.getParent();
9057 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9058
9059 MachineOperand &Dest = Inst.getOperand(0);
9060 MachineOperand &Src0 = Inst.getOperand(1);
9061 MachineOperand &Src1 = Inst.getOperand(2);
9062 const DebugLoc &DL = Inst.getDebugLoc();
9063
9064 MachineBasicBlock::iterator MII = Inst;
9065
9066 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9067
9068 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9069
9070 MachineOperand* Op0;
9071 MachineOperand* Op1;
9072
9073 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9074 Op0 = &Src0;
9075 Op1 = &Src1;
9076 } else {
9077 Op0 = &Src1;
9078 Op1 = &Src0;
9079 }
9080
9081 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9082 .add(*Op0);
9083
9084 Register NewDest = MRI.createVirtualRegister(DestRC);
9085
9086 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9087 .addReg(Interm)
9088 .add(*Op1);
9089
9090 MRI.replaceRegWith(Dest.getReg(), NewDest);
9091
9092 Worklist.insert(&Xor);
9093}
9094
9095void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9096 MachineInstr &Inst) const {
9097 MachineBasicBlock &MBB = *Inst.getParent();
9098 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9099
9100 MachineBasicBlock::iterator MII = Inst;
9101 const DebugLoc &DL = Inst.getDebugLoc();
9102
9103 MachineOperand &Dest = Inst.getOperand(0);
9104 MachineOperand &Src = Inst.getOperand(1);
9105
9106 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
9107 const TargetRegisterClass *SrcRC = Src.isReg() ?
9108 MRI.getRegClass(Src.getReg()) :
9109 &AMDGPU::SGPR_32RegClass;
9110
9111 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9112 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9113
9114 const TargetRegisterClass *SrcSubRC =
9115 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9116
9117 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9118 AMDGPU::sub0, SrcSubRC);
9119 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9120 AMDGPU::sub1, SrcSubRC);
9121
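 // V_BCNT_U32_B32 computes popcount(src0) + src1, so chaining MidReg into the
 // second instruction sums the population counts of the two halves.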
9122 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
9123
9124 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
9125
9126 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9127
9128 // We don't need to legalize operands here. src0 for either instruction can be
9129 // an SGPR, and the second input is unused or determined here.
9130 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9131}
9132
9133void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9134 MachineInstr &Inst) const {
9135 MachineBasicBlock &MBB = *Inst.getParent();
9136 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9137 MachineBasicBlock::iterator MII = Inst;
9138 const DebugLoc &DL = Inst.getDebugLoc();
9139
9140 MachineOperand &Dest = Inst.getOperand(0);
9141 uint32_t Imm = Inst.getOperand(2).getImm();
9142 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9143 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
9144
9145 (void) Offset;
9146
9147 // Only sext_inreg cases handled.
9148 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9149 Offset == 0 && "Not implemented");
9150
9151 if (BitWidth < 32) {
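 // The sign-extended field comes entirely from the low half: V_BFE_I32
 // extracts and sign-extends it, and an arithmetic shift right by 31
 // replicates its sign bit to form the high half of the 64-bit result.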
9152 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9153 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9154 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9155
9156 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
9157 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
9158 .addImm(0)
9159 .addImm(BitWidth);
9160
9161 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
9162 .addImm(31)
9163 .addReg(MidRegLo);
9164
9165 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9166 .addReg(MidRegLo)
9167 .addImm(AMDGPU::sub0)
9168 .addReg(MidRegHi)
9169 .addImm(AMDGPU::sub1);
9170
9171 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9172 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9173 return;
9174 }
9175
9176 MachineOperand &Src = Inst.getOperand(1);
9177 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9178 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9179
9180 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
9181 .addImm(31)
9182 .addReg(Src.getReg(), 0, AMDGPU::sub0);
9183
9184 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9185 .addReg(Src.getReg(), 0, AMDGPU::sub0)
9186 .addImm(AMDGPU::sub0)
9187 .addReg(TmpReg)
9188 .addImm(AMDGPU::sub1);
9189
9190 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9191 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9192}
9193
9194void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9195 MachineInstr &Inst, unsigned Opcode,
9196 MachineDominatorTree *MDT) const {
9197 // (S_FLBIT_I32_B64 hi:lo) ->
9198 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9199 // (S_FF1_I32_B64 hi:lo) ->
9200 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
9201
9202 MachineBasicBlock &MBB = *Inst.getParent();
9203 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9204 MachineBasicBlock::iterator MII = Inst;
9205 const DebugLoc &DL = Inst.getDebugLoc();
9206
9207 MachineOperand &Dest = Inst.getOperand(0);
9208 MachineOperand &Src = Inst.getOperand(1);
9209
9210 const MCInstrDesc &InstDesc = get(Opcode);
9211
9212 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9213 unsigned OpcodeAdd =
9214 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
9215
9216 const TargetRegisterClass *SrcRC =
9217 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9218 const TargetRegisterClass *SrcSubRC =
9219 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9220
9221 MachineOperand SrcRegSub0 =
9222 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9223 MachineOperand SrcRegSub1 =
9224 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9225
9226 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9227 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9228 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9229 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9230
9231 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9232
9233 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9234
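 // V_FFBH/V_FFBL return -1 (all ones) for a zero input; the clamp bit below
 // makes the +32 a saturating add so the V_MIN_U32 never selects a
 // wrapped-around count from an all-zero half.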
9235 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9236 .addReg(IsCtlz ? MidReg1 : MidReg2)
9237 .addImm(32)
9238 .addImm(1); // enable clamp
9239
9240 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9241 .addReg(MidReg3)
9242 .addReg(IsCtlz ? MidReg2 : MidReg1);
9243
9244 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9245
9246 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9247}
9248
9249void SIInstrInfo::addUsersToMoveToVALUWorklist(
9250 Register DstReg, MachineRegisterInfo &MRI,
9251 SIInstrWorklist &Worklist) const {
9252 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9253 MachineInstr &UseMI = *MO.getParent();
9254
9255 unsigned OpNo = 0;
9256
9257 switch (UseMI.getOpcode()) {
9258 case AMDGPU::COPY:
9259 case AMDGPU::WQM:
9260 case AMDGPU::SOFT_WQM:
9261 case AMDGPU::STRICT_WWM:
9262 case AMDGPU::STRICT_WQM:
9263 case AMDGPU::REG_SEQUENCE:
9264 case AMDGPU::PHI:
9265 case AMDGPU::INSERT_SUBREG:
9266 break;
9267 default:
9268 OpNo = MO.getOperandNo();
9269 break;
9270 }
9271
9272 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9273 MRI.constrainRegClass(DstReg, OpRC);
9274
9275 if (!RI.hasVectorRegisters(OpRC))
9276 Worklist.insert(&UseMI);
9277 else
9278 // Legalization could change user list.
9279 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9280 }
9281}
9282
9283void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9284 MachineRegisterInfo &MRI,
9285 MachineInstr &Inst) const {
9286 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9287 MachineBasicBlock *MBB = Inst.getParent();
9288 MachineOperand &Src0 = Inst.getOperand(1);
9289 MachineOperand &Src1 = Inst.getOperand(2);
9290 const DebugLoc &DL = Inst.getDebugLoc();
9291
9292 if (ST.useRealTrue16Insts()) {
9293 Register SrcReg0, SrcReg1;
9294 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9295 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9296 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
9297 } else {
9298 SrcReg0 = Src0.getReg();
9299 }
9300
9301 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9302 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9303 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
9304 } else {
9305 SrcReg1 = Src1.getReg();
9306 }
9307
9308 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9309 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9310
9311 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9312 switch (Inst.getOpcode()) {
9313 case AMDGPU::S_PACK_LL_B32_B16:
9314 NewMI
9315 .addReg(SrcReg0, 0,
9316 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9317 .addImm(AMDGPU::lo16)
9318 .addReg(SrcReg1, 0,
9319 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9320 .addImm(AMDGPU::hi16);
9321 break;
9322 case AMDGPU::S_PACK_LH_B32_B16:
9323 NewMI
9324 .addReg(SrcReg0, 0,
9325 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9326 .addImm(AMDGPU::lo16)
9327 .addReg(SrcReg1, 0, AMDGPU::hi16)
9328 .addImm(AMDGPU::hi16);
9329 break;
9330 case AMDGPU::S_PACK_HL_B32_B16:
9331 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9332 .addImm(AMDGPU::lo16)
9333 .addReg(SrcReg1, 0,
9334 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9335 .addImm(AMDGPU::hi16);
9336 break;
9337 case AMDGPU::S_PACK_HH_B32_B16:
9338 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9339 .addImm(AMDGPU::lo16)
9340 .addReg(SrcReg1, 0, AMDGPU::hi16)
9341 .addImm(AMDGPU::hi16);
9342 break;
9343 default:
9344 llvm_unreachable("unhandled s_pack_* instruction");
9345 }
9346
9347 MachineOperand &Dest = Inst.getOperand(0);
9348 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9349 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9350 return;
9351 }
9352
9353 switch (Inst.getOpcode()) {
9354 case AMDGPU::S_PACK_LL_B32_B16: {
9355 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9356 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9357
9358 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9359 // 0.
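 // pack_ll(lo, hi) = (hi << 16) | (lo & 0xffff): mask Src0 to its low 16 bits,
 // then V_LSHL_OR_B32 shifts Src1 left by 16 and ORs in the masked value.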
9360 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9361 .addImm(0xffff);
9362
9363 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9364 .addReg(ImmReg, RegState::Kill)
9365 .add(Src0);
9366
9367 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9368 .add(Src1)
9369 .addImm(16)
9370 .addReg(TmpReg, RegState::Kill);
9371 break;
9372 }
9373 case AMDGPU::S_PACK_LH_B32_B16: {
9374 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9375 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9376 .addImm(0xffff);
9377 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9378 .addReg(ImmReg, RegState::Kill)
9379 .add(Src0)
9380 .add(Src1);
9381 break;
9382 }
9383 case AMDGPU::S_PACK_HL_B32_B16: {
9384 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9385 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9386 .addImm(16)
9387 .add(Src0);
9388 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9389 .add(Src1)
9390 .addImm(16)
9391 .addReg(TmpReg, RegState::Kill);
9392 break;
9393 }
9394 case AMDGPU::S_PACK_HH_B32_B16: {
9395 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9396 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9397 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9398 .addImm(16)
9399 .add(Src0);
9400 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9401 .addImm(0xffff0000);
9402 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9403 .add(Src1)
9404 .addReg(ImmReg, RegState::Kill)
9405 .addReg(TmpReg, RegState::Kill);
9406 break;
9407 }
9408 default:
9409 llvm_unreachable("unhandled s_pack_* instruction");
9410 }
9411
9412 MachineOperand &Dest = Inst.getOperand(0);
9413 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9414 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9415}
9416
9417void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9418 MachineInstr &SCCDefInst,
9419 SIInstrWorklist &Worklist,
9420 Register NewCond) const {
9421
9422 // Ensure that def inst defines SCC, which is still live.
9423 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9424 !Op.isDead() && Op.getParent() == &SCCDefInst);
9425 SmallVector<MachineInstr *, 4> CopyToDelete;
9426 // This assumes that all the users of SCC are in the same block
9427 // as the SCC def.
9428 for (MachineInstr &MI : // Skip the def inst itself.
9429 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9430 SCCDefInst.getParent()->end())) {
9431 // Check if SCC is used first.
9432 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9433 if (SCCIdx != -1) {
9434 if (MI.isCopy()) {
9435 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9436 Register DestReg = MI.getOperand(0).getReg();
9437
9438 MRI.replaceRegWith(DestReg, NewCond);
9439 CopyToDelete.push_back(&MI);
9440 } else {
9441
9442 if (NewCond.isValid())
9443 MI.getOperand(SCCIdx).setReg(NewCond);
9444
9445 Worklist.insert(&MI);
9446 }
9447 }
9448 // Exit if we find another SCC def.
9449 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9450 break;
9451 }
9452 for (auto &Copy : CopyToDelete)
9453 Copy->eraseFromParent();
9454}
9455
9456// Instructions that use SCC may be converted to VALU instructions. When that
9457// happens, the SCC register is changed to VCC_LO. The instruction that defines
9458// SCC must be changed to an instruction that defines VCC. This function makes
9459// sure that the instruction that defines SCC is added to the moveToVALU
9460// worklist.
9461void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9462 SIInstrWorklist &Worklist) const {
9463 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9464 // then there is nothing to do because the defining instruction has been
9465 // converted to a VALU already. If SCC then that instruction needs to be
9466 // converted to a VALU.
9467 for (MachineInstr &MI :
9468 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9469 SCCUseInst->getParent()->rend())) {
9470 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9471 break;
9472 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9473 Worklist.insert(&MI);
9474 break;
9475 }
9476 }
9477}
9478
9479const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9480 const MachineInstr &Inst) const {
9481 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9482
9483 switch (Inst.getOpcode()) {
9484 // For target instructions, getOpRegClass just returns the virtual register
9485 // class associated with the operand, so we need to find an equivalent VGPR
9486 // register class in order to move the instruction to the VALU.
9487 case AMDGPU::COPY:
9488 case AMDGPU::PHI:
9489 case AMDGPU::REG_SEQUENCE:
9490 case AMDGPU::INSERT_SUBREG:
9491 case AMDGPU::WQM:
9492 case AMDGPU::SOFT_WQM:
9493 case AMDGPU::STRICT_WWM:
9494 case AMDGPU::STRICT_WQM: {
9495 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9496 if (RI.isAGPRClass(SrcRC)) {
9497 if (RI.isAGPRClass(NewDstRC))
9498 return nullptr;
9499
9500 switch (Inst.getOpcode()) {
9501 case AMDGPU::PHI:
9502 case AMDGPU::REG_SEQUENCE:
9503 case AMDGPU::INSERT_SUBREG:
9504 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9505 break;
9506 default:
9507 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9508 }
9509
9510 if (!NewDstRC)
9511 return nullptr;
9512 } else {
9513 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9514 return nullptr;
9515
9516 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9517 if (!NewDstRC)
9518 return nullptr;
9519 }
9520
9521 return NewDstRC;
9522 }
9523 default:
9524 return NewDstRC;
9525 }
9526}
9527
9528// Find the one SGPR operand we are allowed to use.
9529Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9530 int OpIndices[3]) const {
9531 const MCInstrDesc &Desc = MI.getDesc();
9532
9533 // Find the one SGPR operand we are allowed to use.
9534 //
9535 // First we need to consider the instruction's operand requirements before
9536 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9537 // of VCC, but we are still bound by the constant bus requirement to only use
9538 // one.
9539 //
9540 // If the operand's class is an SGPR, we can never move it.
9541
9542 Register SGPRReg = findImplicitSGPRRead(MI);
9543 if (SGPRReg)
9544 return SGPRReg;
9545
9546 Register UsedSGPRs[3] = {Register()};
9547 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9548
9549 for (unsigned i = 0; i < 3; ++i) {
9550 int Idx = OpIndices[i];
9551 if (Idx == -1)
9552 break;
9553
9554 const MachineOperand &MO = MI.getOperand(Idx);
9555 if (!MO.isReg())
9556 continue;
9557
9558 // Is this operand statically required to be an SGPR based on the operand
9559 // constraints?
9560 const TargetRegisterClass *OpRC =
9561 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9562 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9563 if (IsRequiredSGPR)
9564 return MO.getReg();
9565
9566 // If this could be a VGPR or an SGPR, check the dynamic register class.
9567 Register Reg = MO.getReg();
9568 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9569 if (RI.isSGPRClass(RegRC))
9570 UsedSGPRs[i] = Reg;
9571 }
9572
9573 // We don't have a required SGPR operand, so we have a bit more freedom in
9574 // selecting operands to move.
9575
9576 // Try to select the most used SGPR. If an SGPR is equal to one of the
9577 // others, we choose that.
9578 //
9579 // e.g.
9580 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9581 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9582
9583 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9584 // prefer those.
9585
9586 if (UsedSGPRs[0]) {
9587 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9588 SGPRReg = UsedSGPRs[0];
9589 }
9590
9591 if (!SGPRReg && UsedSGPRs[1]) {
9592 if (UsedSGPRs[1] == UsedSGPRs[2])
9593 SGPRReg = UsedSGPRs[1];
9594 }
9595
9596 return SGPRReg;
9597}
9598
9599MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9600 AMDGPU::OpName OperandName) const {
9601 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9602 return nullptr;
9603
9604 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9605 if (Idx == -1)
9606 return nullptr;
9607
9608 return &MI.getOperand(Idx);
9609}
9610
9611uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9612 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9613 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9614 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9615 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9616 return (Format << 44) |
9617 (1ULL << 56) | // RESOURCE_LEVEL = 1
9618 (3ULL << 60); // OOB_SELECT = 3
9619 }
9620
9621 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9622 if (ST.isAmdHsaOS()) {
9623 // Set ATC = 1. GFX9 doesn't have this bit.
9624 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9625 RsrcDataFormat |= (1ULL << 56);
9626
9627 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9628 // BTW, it disables TC L2 and therefore decreases performance.
9629 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9630 RsrcDataFormat |= (2ULL << 59);
9631 }
9632
9633 return RsrcDataFormat;
9634}
9635
9636uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9637 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9638 AMDGPU::RSRC_TID_ENABLE |
9639 0xffffffff; // Size;
9640
9641 // GFX9 doesn't have ELEMENT_SIZE.
9642 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9643 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9644 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9645 }
9646
9647 // IndexStride: 3 selects a 64-lane stride (wave64), 2 selects 32 (wave32).
9648 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9649 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9650
9651 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9652 // Clear them unless we want a huge stride.
9653 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9654 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9655 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9656
9657 return Rsrc23;
9658}
9659
9660bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9661 unsigned Opc = MI.getOpcode();
9662
9663 return isSMRD(Opc);
9664}
9665
9666bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9667 return get(Opc).mayLoad() &&
9668 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9669}
9670
9671Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9672 int &FrameIndex) const {
9673 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9674 if (!Addr || !Addr->isFI())
9675 return Register();
9676
9677 assert(!MI.memoperands_empty() &&
9678 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9679
9680 FrameIndex = Addr->getIndex();
9681 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9682}
9683
9684Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9685 int &FrameIndex) const {
9686 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9687 assert(Addr && Addr->isFI());
9688 FrameIndex = Addr->getIndex();
9689 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9690}
9691
9692Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9693 int &FrameIndex) const {
9694 if (!MI.mayLoad())
9695 return Register();
9696
9697 if (isMUBUF(MI) || isVGPRSpill(MI))
9698 return isStackAccess(MI, FrameIndex);
9699
9700 if (isSGPRSpill(MI))
9701 return isSGPRStackAccess(MI, FrameIndex);
9702
9703 return Register();
9704}
9705
9706Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9707 int &FrameIndex) const {
9708 if (!MI.mayStore())
9709 return Register();
9710
9711 if (isMUBUF(MI) || isVGPRSpill(MI))
9712 return isStackAccess(MI, FrameIndex);
9713
9714 if (isSGPRSpill(MI))
9715 return isSGPRStackAccess(MI, FrameIndex);
9716
9717 return Register();
9718}
9719
9720unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9721 unsigned Size = 0;
9722 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9723 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9724 while (++I != E && I->isInsideBundle()) {
9725 assert(!I->isBundle() && "No nested bundle!");
9726 Size += getInstSizeInBytes(*I);
9727 }
9728
9729 return Size;
9730}
9731
9732unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9733 unsigned Opc = MI.getOpcode();
9734 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9735 unsigned DescSize = Desc.getSize();
9736
9737 // If we have a definitive size, we can use it. Otherwise we need to inspect
9738 // the operands to know the size.
9739 if (isFixedSize(MI)) {
9740 unsigned Size = DescSize;
9741
9742 // If we hit the buggy offset, an extra nop will be inserted in MC so
9743 // estimate the worst case.
9744 if (MI.isBranch() && ST.hasOffset3fBug())
9745 Size += 4;
9746
9747 return Size;
9748 }
9749
9750 // Instructions may have a 32-bit literal encoded after them. Check
9751 // operands that could ever be literals.
9752 if (isVALU(MI) || isSALU(MI)) {
9753 if (isDPP(MI))
9754 return DescSize;
9755 bool HasLiteral = false;
9756 unsigned LiteralSize = 4;
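 // At most one literal trails the encoding. It is 4 bytes, except on
 // subtargets with 64-bit literals where an immediate that does not fit in
 // 32 bits takes 8 bytes.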
9757 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9758 const MachineOperand &Op = MI.getOperand(I);
9759 const MCOperandInfo &OpInfo = Desc.operands()[I];
9760 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9761 HasLiteral = true;
9762 if (ST.has64BitLiterals()) {
9763 switch (OpInfo.OperandType) {
9764 default:
9765 break;
9766 case AMDGPU::OPERAND_REG_IMM_FP64:
9767 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9768 LiteralSize = 8;
9769 break;
9770 case AMDGPU::OPERAND_REG_IMM_INT64:
9771 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9772 LiteralSize = 8;
9773 break;
9774 }
9775 }
9776 break;
9777 }
9778 }
9779 return HasLiteral ? DescSize + LiteralSize : DescSize;
9780 }
9781
9782 // Check whether we have extra NSA words.
9783 if (isMIMG(MI)) {
9784 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9785 if (VAddr0Idx < 0)
9786 return 8;
9787
9788 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9789 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9790 }
9791
9792 switch (Opc) {
9793 case TargetOpcode::BUNDLE:
9794 return getInstBundleSize(MI);
9795 case TargetOpcode::INLINEASM:
9796 case TargetOpcode::INLINEASM_BR: {
9797 const MachineFunction *MF = MI.getMF();
9798 const char *AsmStr = MI.getOperand(0).getSymbolName();
9799 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9800 }
9801 default:
9802 if (MI.isMetaInstruction())
9803 return 0;
9804
9805 // If D16 Pseudo inst, get correct MC code size
9806 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9807 if (D16Info) {
9808 // Assume the d16_lo and d16_hi variants are always the same size.
9809 unsigned LoInstOpcode = D16Info->LoOp;
9810 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9811 DescSize = Desc.getSize();
9812 }
9813
9814 // If FMA Pseudo inst, get correct MC code size
9815 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9816 // All potential lowerings are the same size; arbitrarily pick one.
9817 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9818 DescSize = Desc.getSize();
9819 }
9820
9821 return DescSize;
9822 }
9823}
9824
9826 if (!isFLAT(MI))
9827 return false;
9828
9829 if (MI.memoperands_empty())
9830 return true;
9831
9832 for (const MachineMemOperand *MMO : MI.memoperands()) {
9833 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9834 return true;
9835 }
9836 return false;
9837}
9838
9839ArrayRef<std::pair<int, const char *>>
9840SIInstrInfo::getSerializableTargetIndices() const {
9841 static const std::pair<int, const char *> TargetIndices[] = {
9842 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9843 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9844 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9845 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9846 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9847 return ArrayRef(TargetIndices);
9848}
9849
9850/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9851/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9852ScheduleHazardRecognizer *
9853SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9854 const ScheduleDAG *DAG) const {
9855 return new GCNHazardRecognizer(DAG->MF);
9856}
9857
9858/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9859/// pass.
9860ScheduleHazardRecognizer *
9861SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
9862 return new GCNHazardRecognizer(MF);
9863}
9864
9865// Called during:
9866// - pre-RA scheduling and post-RA scheduling
9867ScheduleHazardRecognizer *
9868SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9869 const ScheduleDAGMI *DAG) const {
9870 // Borrowed from Arm Target
9871 // We would like to restrict this hazard recognizer to only
9872 // post-RA scheduling; we can tell that we're post-RA because we don't
9873 // track VRegLiveness.
9874 if (!DAG->hasVRegLiveness())
9875 return new GCNHazardRecognizer(DAG->MF);
9876 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9877}
9878
9879std::pair<unsigned, unsigned>
9880SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9881 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9882}
9883
9884ArrayRef<std::pair<unsigned, const char *>>
9885SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9886 static const std::pair<unsigned, const char *> TargetFlags[] = {
9887 {MO_GOTPCREL, "amdgpu-gotprel"},
9888 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9889 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9890 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9891 {MO_REL32_LO, "amdgpu-rel32-lo"},
9892 {MO_REL32_HI, "amdgpu-rel32-hi"},
9893 {MO_REL64, "amdgpu-rel64"},
9894 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9895 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9896 {MO_ABS64, "amdgpu-abs64"},
9897 };
9898
9899 return ArrayRef(TargetFlags);
9900}
9901
9902ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9903SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9904 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9905 {
9906 {MONoClobber, "amdgpu-noclobber"},
9907 {MOLastUse, "amdgpu-last-use"},
9908 {MOCooperative, "amdgpu-cooperative"},
9909 };
9910
9911 return ArrayRef(TargetFlags);
9912}
9913
9914unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9915 const MachineFunction &MF) const {
9916 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9917 assert(SrcReg.isVirtual());
9918 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9919 return AMDGPU::WWM_COPY;
9920
9921 return AMDGPU::COPY;
9922}
9923
9924bool SIInstrInfo::canAddToBBProlog(const MachineInstr &MI) const {
9925 uint16_t Opcode = MI.getOpcode();
9926 // Check if it is SGPR spill or wwm-register spill Opcode.
9927 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
9928 return true;
9929
9930 const MachineFunction *MF = MI.getMF();
9931 const MachineRegisterInfo &MRI = MF->getRegInfo();
9932 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9933
9934 // See if this is a live-range split instruction inserted for an SGPR or a
9935 // wwm-register. The implicit defs inserted for wwm-registers should also be
9936 // included as they can appear at the beginning of the basic block.
9937 bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
9938 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
9939 return false;
9940
9941 Register Reg = MI.getOperand(0).getReg();
9942 if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
9943 return IsLRSplitInst;
9944
9945 return MFI->isWWMReg(Reg);
9946}
9947
9948bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9949 Register Reg) const {
9950 // We need to handle instructions which may be inserted during register
9951 // allocation to handle the prolog. The initial prolog instruction may have
9952 // been separated from the start of the block by spills and copies inserted
9953 // needed by the prolog. However, the insertions for scalar registers can
9954 // always be placed at the BB top as they are independent of the exec mask
9955 // value.
9956 bool IsNullOrVectorRegister = true;
9957 if (Reg) {
9958 const MachineFunction *MF = MI.getMF();
9959 const MachineRegisterInfo &MRI = MF->getRegInfo();
9960 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9961 }
9962
9963 return IsNullOrVectorRegister &&
9964 (canAddToBBProlog(MI) ||
9965 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
9966 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9967}
9968
9969MachineInstrBuilder
9970SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9971 MachineBasicBlock::iterator I,
9972 const DebugLoc &DL,
9973 Register DestReg) const {
9974 if (ST.hasAddNoCarry())
9975 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9976
9977 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9978 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9979 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9980
9981 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9982 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9983}
9984
9985MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9986 MachineBasicBlock::iterator I,
9987 const DebugLoc &DL,
9988 Register DestReg,
9989 RegScavenger &RS) const {
9990 if (ST.hasAddNoCarry())
9991 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9992
9993 // If available, prefer to use vcc.
9994 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9995 ? Register(RI.getVCC())
9996 : RS.scavengeRegisterBackwards(
9997 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9998 0, /* AllowSpill */ false);
9999
10000 // TODO: Users need to deal with this.
10001 if (!UnusedCarry.isValid())
10002 return MachineInstrBuilder();
10003
10004 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10005 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10006}
10007
10008bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10009 switch (Opcode) {
10010 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10011 case AMDGPU::SI_KILL_I1_TERMINATOR:
10012 return true;
10013 default:
10014 return false;
10015 }
10016}
10017
10018const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
10019 switch (Opcode) {
10020 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10021 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10022 case AMDGPU::SI_KILL_I1_PSEUDO:
10023 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
10024 default:
10025 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10026 }
10027}
10028
10029bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10030 return Imm <= getMaxMUBUFImmOffset(ST);
10031}
10032
10033unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
10034 // GFX12 field is non-negative 24-bit signed byte offset.
10035 const unsigned OffsetBits =
10036 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10037 return (1 << OffsetBits) - 1;
10038}
10039
10040void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
10041 if (!ST.isWave32())
10042 return;
10043
10044 if (MI.isInlineAsm())
10045 return;
10046
10047 for (auto &Op : MI.implicit_operands()) {
10048 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10049 Op.setReg(AMDGPU::VCC_LO);
10050 }
10051}
10052
10053bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
10054 if (!isSMRD(MI))
10055 return false;
10056
10057 // Check that it is using a buffer resource.
10058 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
10059 if (Idx == -1) // e.g. s_memtime
10060 return false;
10061
10062 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
10063 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
10064}
10065
10066// Given Imm, split it into the values to put into the SOffset and ImmOffset
10067// fields in an MUBUF instruction. Return false if it is not possible (due to a
10068// hardware bug needing a workaround).
10069//
10070// The required alignment ensures that individual address components remain
10071// aligned if they are aligned to begin with. It also ensures that additional
10072// offsets within the given alignment can be added to the resulting ImmOffset.
10073bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
10074 uint32_t &ImmOffset, Align Alignment) const {
10075 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10076 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
10077 uint32_t Overflow = 0;
10078
10079 if (Imm > MaxImm) {
10080 if (Imm <= MaxImm + 64) {
10081 // Use an SOffset inline constant for 4..64
10082 Overflow = Imm - MaxImm;
10083 Imm = MaxImm;
10084 } else {
10085 // Try to keep the same value in SOffset for adjacent loads, so that
10086 // the corresponding register contents can be re-used.
10087 //
10088 // Load values with all low-bits (except for alignment bits) set into
10089 // SOffset, so that a larger range of values can be covered using
10090 // s_movk_i32.
10091 //
10092 // Atomic operations fail to work correctly when individual address
10093 // components are unaligned, even if their sum is aligned.
10094 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10095 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10096 Imm = Low;
10097 Overflow = High - Alignment.value();
10098 }
10099 }
10100
10101 if (Overflow > 0) {
10102 // There is a hardware bug in SI and CI which prevents address clamping in
10103 // MUBUF instructions from working correctly with SOffsets. The immediate
10104 // offset is unaffected.
10105 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10106 return false;
10107
10108 // It is not possible to set immediate in SOffset field on some targets.
10109 if (ST.hasRestrictedSOffset())
10110 return false;
10111 }
10112
10113 ImmOffset = Imm;
10114 SOffset = Overflow;
10115 return true;
10116}
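The bit manipulation in the overflow path above is easier to see with concrete numbers. Below is a minimal standalone sketch (an editorial example, not part of this file; the constants are assumptions for illustration) that mirrors the same arithmetic for a 12-bit field with 4-byte alignment:

// Sketch of the SOffset/ImmOffset split, assuming MaxOffset = 4095 (a 12-bit,
// all-ones mask) and 4-byte alignment.
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t MaxOffset = 4095;
  const uint32_t Alignment = 4;
  const uint32_t MaxImm = MaxOffset & ~(Alignment - 1); // alignDown -> 4092

  uint32_t Imm = 10000, Overflow = 0;
  if (Imm > MaxImm) {
    if (Imm <= MaxImm + 64) {
      Overflow = Imm - MaxImm; // small value goes into SOffset as an inline constant
      Imm = MaxImm;
    } else {
      // Keep the low bits (minus the alignment) as the immediate and move the
      // rest into SOffset so adjacent loads can share the SOffset register.
      uint32_t High = (Imm + Alignment) & ~MaxOffset;
      uint32_t Low = (Imm + Alignment) & MaxOffset;
      Imm = Low;
      Overflow = High - Alignment;
    }
  }
  assert(Imm + Overflow == 10000);                        // the parts still add up
  assert(Imm % Alignment == 0 && Overflow % Alignment == 0);
  std::printf("ImmOffset = %u, SOffset = %u\n", Imm, Overflow); // 1812, 8188
  return 0;
}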
10117
10118// Depending on the used address space and instructions, some immediate offsets
10119// are allowed and some are not.
10120// Pre-GFX12, flat instruction offsets can only be non-negative, global and
10121// scratch instruction offsets can also be negative. On GFX12, offsets can be
10122// negative for all variants.
10123//
10124// There are several bugs related to these offsets:
10125// On gfx10.1, flat instructions that go into the global address space cannot
10126// use an offset.
10127//
10128// For scratch instructions, the address can be either an SGPR or a VGPR.
10129// The following offsets can be used, depending on the architecture (x means
10130// cannot be used):
10131// +----------------------------+------+------+
10132// | Address-Mode | SGPR | VGPR |
10133// +----------------------------+------+------+
10134// | gfx9 | | |
10135// | negative, 4-aligned offset | x | ok |
10136// | negative, unaligned offset | x | ok |
10137// +----------------------------+------+------+
10138// | gfx10 | | |
10139// | negative, 4-aligned offset | ok | ok |
10140// | negative, unaligned offset | ok | x |
10141// +----------------------------+------+------+
10142// | gfx10.3 | | |
10143// | negative, 4-aligned offset | ok | ok |
10144// | negative, unaligned offset | ok | ok |
10145// +----------------------------+------+------+
10146//
10147// This function ignores the addressing mode, so if an offset cannot be used in
10148// one addressing mode, it is considered illegal.
10149bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10150 uint64_t FlatVariant) const {
10151 // TODO: Should 0 be special cased?
10152 if (!ST.hasFlatInstOffsets())
10153 return false;
10154
10155 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10156 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10157 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10158 return false;
10159
10160 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10161 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10162 (Offset % 4) != 0) {
10163 return false;
10164 }
10165
10166 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10167 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10168 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10169}
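The final check above is just a signed range test. A minimal standalone sketch (an editorial example, not part of this file; the helper name and the 13-bit width are assumptions) of what isIntN plus the AllowNegative guard amounts to:

// Sketch: an N-bit signed offset field holds values in
// [-(1 << (N-1)), (1 << (N-1)) - 1]; without AllowNegative only the
// non-negative half remains legal.
#include <cstdint>
#include <cstdio>

static bool fitsSignedN(int64_t Offset, unsigned N) {
  int64_t Min = -(int64_t(1) << (N - 1));
  int64_t Max = (int64_t(1) << (N - 1)) - 1;
  return Offset >= Min && Offset <= Max; // same idea as isIntN(N, Offset)
}

int main() {
  const unsigned N = 13; // assumed field width for illustration
  std::printf("%d %d %d\n",
              fitsSignedN(4095, N),   // 1: fits
              fitsSignedN(-4096, N),  // 1: fits, but also needs AllowNegative
              fitsSignedN(4096, N));  // 0: out of range
  return 0;
}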
10170
10171 // See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what is not.
10172std::pair<int64_t, int64_t>
10173SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10174 uint64_t FlatVariant) const {
10175 int64_t RemainderOffset = COffsetVal;
10176 int64_t ImmField = 0;
10177
10178 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10179 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10180
10181 if (AllowNegative) {
10182 // Use signed division by a power of two to truncate towards 0.
10183 int64_t D = 1LL << NumBits;
10184 RemainderOffset = (COffsetVal / D) * D;
10185 ImmField = COffsetVal - RemainderOffset;
10186
10187 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10188 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10189 (ImmField % 4) != 0) {
10190 // Make ImmField a multiple of 4
10191 RemainderOffset += ImmField % 4;
10192 ImmField -= ImmField % 4;
10193 }
10194 } else if (COffsetVal >= 0) {
10195 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10196 RemainderOffset = COffsetVal - ImmField;
10197 }
10198
10199 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10200 assert(RemainderOffset + ImmField == COffsetVal);
10201 return {ImmField, RemainderOffset};
10202}
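A minimal standalone sketch (an editorial example, not part of this file; the 13-bit signed field, i.e. NumBits = 12, is an assumption for illustration) of the truncating split performed above:

// Sketch of the flat offset split: truncating signed division keeps ImmField
// and RemainderOffset on the same side of zero as the original offset.
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t D = 1LL << 12;                      // NumBits = 12 -> D = 4096
  const int64_t COffsetVal = -5000;
  int64_t RemainderOffset = (COffsetVal / D) * D;   // -4096 (truncates toward 0)
  int64_t ImmField = COffsetVal - RemainderOffset;  // -904
  assert(ImmField + RemainderOffset == COffsetVal);
  assert(ImmField > -D && ImmField < D);            // fits the 13-bit signed field
  std::printf("ImmField = %lld, Remainder = %lld\n", (long long)ImmField,
              (long long)RemainderOffset);
  return 0;
}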
10203
10204bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
10205 if (ST.hasNegativeScratchOffsetBug() &&
10206 FlatVariant == SIInstrFlags::FlatScratch)
10207 return false;
10208
10209 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
10210}
10211
10212static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10213 switch (ST.getGeneration()) {
10214 default:
10215 break;
10216 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
10217 case AMDGPUSubtarget::SEA_ISLANDS:
10218 return SIEncodingFamily::SI;
10219 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
10220 case AMDGPUSubtarget::GFX9:
10221 return SIEncodingFamily::VI;
10222 case AMDGPUSubtarget::GFX10:
10223 return SIEncodingFamily::GFX10;
10224 case AMDGPUSubtarget::GFX11:
10225 return SIEncodingFamily::GFX11;
10226 case AMDGPUSubtarget::GFX12:
10227 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10228 : SIEncodingFamily::GFX12;
10229 }
10230 llvm_unreachable("Unknown subtarget generation!");
10231}
10232
10233bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10234 switch(MCOp) {
10235 // These opcodes use indirect register addressing so
10236 // they need special handling by codegen (currently missing).
10237 // Therefore it is too risky to allow these opcodes
10238 // to be selected by the DPP combiner or the SDWA peephole pass.
10239 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10240 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10241 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10242 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10243 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10244 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10245 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10246 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10247 return true;
10248 default:
10249 return false;
10250 }
10251}
10252
10253#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10254 case OPCODE##_dpp: \
10255 case OPCODE##_e32: \
10256 case OPCODE##_e64: \
10257 case OPCODE##_e64_dpp: \
10258 case OPCODE##_sdwa:
10259
10260static bool isRenamedInGFX9(int Opcode) {
10261 switch (Opcode) {
10262 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10263 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10264 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10265 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10266 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10267 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10268 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10269 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10270 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10271 //
10272 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10273 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10274 case AMDGPU::V_FMA_F16_gfx9_e64:
10275 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10276 case AMDGPU::V_INTERP_P2_F16:
10277 case AMDGPU::V_MAD_F16_e64:
10278 case AMDGPU::V_MAD_U16_e64:
10279 case AMDGPU::V_MAD_I16_e64:
10280 return true;
10281 default:
10282 return false;
10283 }
10284}
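For reference, a sketch of what one use of the macro above expands to after token pasting (shown for V_ADD_U32; the other opcodes expand the same way):

// GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32) expands to:
//   case AMDGPU::V_ADD_U32_dpp:
//   case AMDGPU::V_ADD_U32_e32:
//   case AMDGPU::V_ADD_U32_e64:
//   case AMDGPU::V_ADD_U32_e64_dpp:
//   case AMDGPU::V_ADD_U32_sdwa: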
10285
10286int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10287 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10288 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10289
10290 unsigned Gen = subtargetEncodingFamily(ST);
10291
10292 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10293 Gen = SIEncodingFamily::GFX9;
10294 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10295 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10296 // subtarget has UnpackedD16VMem feature.
10297 // TODO: remove this when we discard GFX80 encoding.
10298 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10299 Gen = SIEncodingFamily::GFX80;
10300
10301 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10302 switch (ST.getGeneration()) {
10303 default:
10304 Gen = SIEncodingFamily::SDWA;
10305 break;
10306 case AMDGPUSubtarget::GFX9:
10307 Gen = SIEncodingFamily::SDWA9;
10308 break;
10309 case AMDGPUSubtarget::GFX10:
10310 Gen = SIEncodingFamily::SDWA10;
10311 break;
10312 }
10313 }
10314
10315 if (isMAI(Opcode)) {
10316 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10317 if (MFMAOp != -1)
10318 Opcode = MFMAOp;
10319 }
10320
10321 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10322
10323 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
10325
10326 // -1 means that Opcode is already a native instruction.
10327 if (MCOp == -1)
10328 return Opcode;
10329
10330 if (ST.hasGFX90AInsts()) {
10331 uint16_t NMCOp = (uint16_t)-1;
10332 if (ST.hasGFX940Insts())
10333 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
10334 if (NMCOp == (uint16_t)-1)
10335 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
10336 if (NMCOp == (uint16_t)-1)
10337 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
10338 if (NMCOp != (uint16_t)-1)
10339 MCOp = NMCOp;
10340 }
10341
10342 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10343 // no encoding in the given subtarget generation.
10344 if (MCOp == (uint16_t)-1)
10345 return -1;
10346
10347 if (isAsmOnlyOpcode(MCOp))
10348 return -1;
10349
10350 return MCOp;
10351}
10352
10353static
10354TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
10355 assert(RegOpnd.isReg());
10356 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10357 getRegSubRegPair(RegOpnd);
10358}
10359
10360TargetInstrInfo::RegSubRegPair llvm::getRegSequenceSubReg(MachineInstr &MI,
10361 unsigned SubReg) {
10362 assert(MI.isRegSequence());
10363 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10364 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10365 auto &RegOp = MI.getOperand(1 + 2 * I);
10366 return getRegOrUndef(RegOp);
10367 }
10368 return TargetInstrInfo::RegSubRegPair();
10369}
10370
10371// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10372// Following a subreg of reg:subreg isn't supported
10373static bool followSubRegDef(MachineInstr &MI,
10374 TargetInstrInfo::RegSubRegPair &RSR) {
10375 if (!RSR.SubReg)
10376 return false;
10377 switch (MI.getOpcode()) {
10378 default: break;
10379 case AMDGPU::REG_SEQUENCE:
10380 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10381 return true;
10382 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
10383 case AMDGPU::INSERT_SUBREG:
10384 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10385 // inserted the subreg we're looking for
10386 RSR = getRegOrUndef(MI.getOperand(2));
10387 else { // the subreg in the rest of the reg
10388 auto R1 = getRegOrUndef(MI.getOperand(1));
10389 if (R1.SubReg) // subreg of subreg isn't supported
10390 return false;
10391 RSR.Reg = R1.Reg;
10392 }
10393 return true;
10394 }
10395 return false;
10396}
10397
10398MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
10399 const MachineRegisterInfo &MRI) {
10400 assert(MRI.isSSA());
10401 if (!P.Reg.isVirtual())
10402 return nullptr;
10403
10404 auto RSR = P;
10405 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10406 while (auto *MI = DefInst) {
10407 DefInst = nullptr;
10408 switch (MI->getOpcode()) {
10409 case AMDGPU::COPY:
10410 case AMDGPU::V_MOV_B32_e32: {
10411 auto &Op1 = MI->getOperand(1);
10412 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10413 if (Op1.isUndef())
10414 return nullptr;
10415 RSR = getRegSubRegPair(Op1);
10416 DefInst = MRI.getVRegDef(RSR.Reg);
10417 }
10418 break;
10419 }
10420 default:
10421 if (followSubRegDef(*MI, RSR)) {
10422 if (!RSR.Reg)
10423 return nullptr;
10424 DefInst = MRI.getVRegDef(RSR.Reg);
10425 }
10426 }
10427 if (!DefInst)
10428 return MI;
10429 }
10430 return nullptr;
10431}
10432
10433bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
10434 Register VReg,
10435 const MachineInstr &DefMI,
10436 const MachineInstr &UseMI) {
10437 assert(MRI.isSSA() && "Must be run on SSA");
10438
10439 auto *TRI = MRI.getTargetRegisterInfo();
10440 auto *DefBB = DefMI.getParent();
10441
10442 // Don't bother searching between blocks, although it is possible this block
10443 // doesn't modify exec.
10444 if (UseMI.getParent() != DefBB)
10445 return true;
10446
10447 const int MaxInstScan = 20;
10448 int NumInst = 0;
10449
10450 // Stop scan at the use.
10451 auto E = UseMI.getIterator();
10452 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10453 if (I->isDebugInstr())
10454 continue;
10455
10456 if (++NumInst > MaxInstScan)
10457 return true;
10458
10459 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10460 return true;
10461 }
10462
10463 return false;
10464}
10465
10466bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10467 Register VReg,
10468 const MachineInstr &DefMI) {
10469 assert(MRI.isSSA() && "Must be run on SSA");
10470
10471 auto *TRI = MRI.getTargetRegisterInfo();
10472 auto *DefBB = DefMI.getParent();
10473
10474 const int MaxUseScan = 10;
10475 int NumUse = 0;
10476
10477 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10478 auto &UseInst = *Use.getParent();
10479 // Don't bother searching between blocks, although it is possible this block
10480 // doesn't modify exec.
10481 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10482 return true;
10483
10484 if (++NumUse > MaxUseScan)
10485 return true;
10486 }
10487
10488 if (NumUse == 0)
10489 return false;
10490
10491 const int MaxInstScan = 20;
10492 int NumInst = 0;
10493
10494 // Stop scan when we have seen all the uses.
10495 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10496 assert(I != DefBB->end());
10497
10498 if (I->isDebugInstr())
10499 continue;
10500
10501 if (++NumInst > MaxInstScan)
10502 return true;
10503
10504 for (const MachineOperand &Op : I->operands()) {
10505 // We don't check reg masks here as they're used only on calls:
10506 // 1. EXEC is only considered const within one BB
10507 // 2. Call should be a terminator instruction if present in a BB
10508
10509 if (!Op.isReg())
10510 continue;
10511
10512 Register Reg = Op.getReg();
10513 if (Op.isUse()) {
10514 if (Reg == VReg && --NumUse == 0)
10515 return false;
10516 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10517 return true;
10518 }
10519 }
10520}
10521
10522MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10523 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10524 const DebugLoc &DL, Register Src, Register Dst) const {
10525 auto Cur = MBB.begin();
10526 if (Cur != MBB.end())
10527 do {
10528 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10529 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10530 ++Cur;
10531 } while (Cur != MBB.end() && Cur != LastPHIIt);
10532
10533 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10534 Dst);
10535}
10536
10537MachineInstr *SIInstrInfo::createPHISourceCopy(
10538 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10539 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10540 if (InsPt != MBB.end() &&
10541 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10542 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10543 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10544 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10545 InsPt++;
10546 return BuildMI(MBB, InsPt, DL,
10547 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10548 .addReg(Src, 0, SrcSubReg)
10549 .addReg(AMDGPU::EXEC, RegState::Implicit);
10550 }
10551 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10552 Dst);
10553}
10554
10555bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10556
10557MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
10558 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
10559 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10560 VirtRegMap *VRM) const {
10561 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10562 //
10563 // %0:sreg_32 = COPY $m0
10564 //
10565 // We explicitly chose SReg_32 for the virtual register so such a copy might
10566 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10567 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10568 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10569 // TargetInstrInfo::foldMemoryOperand() is going to try.
10570 // A similar issue also exists with spilling and reloading $exec registers.
10571 //
10572 // To prevent that, constrain the %0 register class here.
10573 if (isFullCopyInstr(MI)) {
10574 Register DstReg = MI.getOperand(0).getReg();
10575 Register SrcReg = MI.getOperand(1).getReg();
10576 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10577 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10578 MachineRegisterInfo &MRI = MF.getRegInfo();
10579 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10580 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10581 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10582 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10583 return nullptr;
10584 }
10585 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10586 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10587 return nullptr;
10588 }
10589 }
10590 }
10591
10592 return nullptr;
10593}
10594
10595unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10596 const MachineInstr &MI,
10597 unsigned *PredCost) const {
10598 if (MI.isBundle()) {
10599 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
10600 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10601 unsigned Lat = 0, Count = 0;
10602 for (++I; I != E && I->isBundledWithPred(); ++I) {
10603 ++Count;
10604 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10605 }
10606 return Lat + Count - 1;
10607 }
10608
10609 return SchedModel.computeInstrLatency(&MI);
10610}
10611
10612const MachineOperand &
10613SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const {
10614 if (const MachineOperand *CallAddrOp =
10615 getNamedOperand(MI, AMDGPU::OpName::src0))
10616 return *CallAddrOp;
10618}
10619
10620InstructionUniformity
10621SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10622 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10623 unsigned Opcode = MI.getOpcode();
10624
10625 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10626 Register Dst = MI.getOperand(0).getReg();
10627 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10628 : MI.getOperand(1).getReg();
10629 LLT DstTy = MRI.getType(Dst);
10630 LLT SrcTy = MRI.getType(Src);
10631 unsigned DstAS = DstTy.getAddressSpace();
10632 unsigned SrcAS = SrcTy.getAddressSpace();
10633 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10634 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10635 ST.hasGloballyAddressableScratch()
10636 ? InstructionUniformity::NeverUniform
10637 : InstructionUniformity::Default;
10638 };
10639
10640 // If the target supports globally addressable scratch, the mapping from
10641 // scratch memory to the flat aperture changes, and therefore an address space
10642 // cast is no longer uniform.
10643 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10644 return HandleAddrSpaceCast(MI);
10645
10646 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10647 auto IID = GI->getIntrinsicID();
10648 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10649 return InstructionUniformity::NeverUniform;
10650 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10651 return InstructionUniformity::AlwaysUniform;
10652
10653 switch (IID) {
10654 case Intrinsic::amdgcn_addrspacecast_nonnull:
10655 return HandleAddrSpaceCast(MI);
10656 case Intrinsic::amdgcn_if:
10657 case Intrinsic::amdgcn_else:
10658 // FIXME: Uniform if second result
10659 break;
10660 }
10661
10662 return InstructionUniformity::Default;
10663 }
10664
10665 // Loads from the private and flat address spaces are divergent, because
10666 // threads can execute the load instruction with the same inputs and get
10667 // different results.
10668 //
10669 // All other loads are not divergent, because if threads issue loads with the
10670 // same arguments, they will always get the same result.
10671 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10672 Opcode == AMDGPU::G_SEXTLOAD) {
10673 if (MI.memoperands_empty())
10674 return InstructionUniformity::NeverUniform; // conservative assumption
10675
10676 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10677 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10678 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10679 })) {
10680 // At least one MMO in a non-global address space.
10681 return InstructionUniformity::NeverUniform;
10682 }
10683 return InstructionUniformity::Default;
10684 }
10685
10686 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10687 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10688 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10689 AMDGPU::isGenericAtomic(Opcode)) {
10690 return InstructionUniformity::NeverUniform;
10691 }
10692 return InstructionUniformity::Default;
10693}
10694
10695InstructionUniformity
10696SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10697
10698 if (isNeverUniform(MI))
10699 return InstructionUniformity::NeverUniform;
10700
10701 unsigned opcode = MI.getOpcode();
10702 if (opcode == AMDGPU::V_READLANE_B32 ||
10703 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10704 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10705 return InstructionUniformity::AlwaysUniform;
10706
10707 if (isCopyInstr(MI)) {
10708 const MachineOperand &srcOp = MI.getOperand(1);
10709 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10710 const TargetRegisterClass *regClass =
10711 RI.getPhysRegBaseClass(srcOp.getReg());
10712 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10713 : InstructionUniformity::NeverUniform;
10714 }
10715 return InstructionUniformity::Default;
10716 }
10717
10718 // GMIR handling
10719 if (MI.isPreISelOpcode())
10720 return SIInstrInfo::getGenericInstructionUniformity(MI);
10721
10722 // Atomics are divergent because they are executed sequentially: when an
10723 // atomic operation refers to the same address in each thread, then each
10724 // thread after the first sees the value written by the previous thread as
10725 // its original value.
10726
10727 if (isAtomic(MI))
10728 return InstructionUniformity::NeverUniform;
10729
10730 // Loads from the private and flat address spaces are divergent, because
10731 // threads can execute the load instruction with the same inputs and get
10732 // different results.
10733 if (isFLAT(MI) && MI.mayLoad()) {
10734 if (MI.memoperands_empty())
10735 return InstructionUniformity::NeverUniform; // conservative assumption
10736
10737 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10738 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10739 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10740 })) {
10741 // At least one MMO in a non-global address space.
10742 return InstructionUniformity::NeverUniform;
10743 }
10744
10745 return InstructionUniformity::Default;
10746 }
10747
10748 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10749 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10750
10751 // FIXME: It's conceptually broken to report this for an instruction, and not
10752 // a specific def operand. For inline asm in particular, there could be mixed
10753 // uniform and divergent results.
10754 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10755 const MachineOperand &SrcOp = MI.getOperand(I);
10756 if (!SrcOp.isReg())
10757 continue;
10758
10759 Register Reg = SrcOp.getReg();
10760 if (!Reg || !SrcOp.readsReg())
10761 continue;
10762
10763 // If RegBank is null, this is unassigned or an unallocatable special
10764 // register, which are all scalars.
10765 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10766 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10767 return InstructionUniformity::NeverUniform;
10768 }
10769
10770 // TODO: Uniformity check conditions above can be rearranged for more
10771 // readability.
10772
10773 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10774 // currently turned into no-op COPYs by SelectionDAG ISel and are
10775 // therefore no longer recognizable.
10776
10777 return InstructionUniformity::Default;
10778}
10779
10780unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10781 switch (MF.getFunction().getCallingConv()) {
10782 case CallingConv::AMDGPU_PS:
10783 return 1;
10784 case CallingConv::AMDGPU_VS:
10785 return 2;
10786 case CallingConv::AMDGPU_GS:
10787 return 3;
10788 case CallingConv::AMDGPU_HS:
10789 case CallingConv::AMDGPU_LS:
10790 case CallingConv::AMDGPU_ES: {
10791 const Function &F = MF.getFunction();
10792 F.getContext().diagnose(DiagnosticInfoUnsupported(
10793 F, "ds_ordered_count unsupported for this calling conv"));
10794 [[fallthrough]];
10795 }
10796 case CallingConv::AMDGPU_CS:
10797 case CallingConv::AMDGPU_KERNEL:
10798 case CallingConv::C:
10799 case CallingConv::Fast:
10800 default:
10801 // Assume other calling conventions are various compute callable functions
10802 return 0;
10803 }
10804}
10805
10806bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10807 Register &SrcReg2, int64_t &CmpMask,
10808 int64_t &CmpValue) const {
10809 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10810 return false;
10811
10812 switch (MI.getOpcode()) {
10813 default:
10814 break;
10815 case AMDGPU::S_CMP_EQ_U32:
10816 case AMDGPU::S_CMP_EQ_I32:
10817 case AMDGPU::S_CMP_LG_U32:
10818 case AMDGPU::S_CMP_LG_I32:
10819 case AMDGPU::S_CMP_LT_U32:
10820 case AMDGPU::S_CMP_LT_I32:
10821 case AMDGPU::S_CMP_GT_U32:
10822 case AMDGPU::S_CMP_GT_I32:
10823 case AMDGPU::S_CMP_LE_U32:
10824 case AMDGPU::S_CMP_LE_I32:
10825 case AMDGPU::S_CMP_GE_U32:
10826 case AMDGPU::S_CMP_GE_I32:
10827 case AMDGPU::S_CMP_EQ_U64:
10828 case AMDGPU::S_CMP_LG_U64:
10829 SrcReg = MI.getOperand(0).getReg();
10830 if (MI.getOperand(1).isReg()) {
10831 if (MI.getOperand(1).getSubReg())
10832 return false;
10833 SrcReg2 = MI.getOperand(1).getReg();
10834 CmpValue = 0;
10835 } else if (MI.getOperand(1).isImm()) {
10836 SrcReg2 = Register();
10837 CmpValue = MI.getOperand(1).getImm();
10838 } else {
10839 return false;
10840 }
10841 CmpMask = ~0;
10842 return true;
10843 case AMDGPU::S_CMPK_EQ_U32:
10844 case AMDGPU::S_CMPK_EQ_I32:
10845 case AMDGPU::S_CMPK_LG_U32:
10846 case AMDGPU::S_CMPK_LG_I32:
10847 case AMDGPU::S_CMPK_LT_U32:
10848 case AMDGPU::S_CMPK_LT_I32:
10849 case AMDGPU::S_CMPK_GT_U32:
10850 case AMDGPU::S_CMPK_GT_I32:
10851 case AMDGPU::S_CMPK_LE_U32:
10852 case AMDGPU::S_CMPK_LE_I32:
10853 case AMDGPU::S_CMPK_GE_U32:
10854 case AMDGPU::S_CMPK_GE_I32:
10855 SrcReg = MI.getOperand(0).getReg();
10856 SrcReg2 = Register();
10857 CmpValue = MI.getOperand(1).getImm();
10858 CmpMask = ~0;
10859 return true;
10860 }
10861
10862 return false;
10863}
10864
10865static bool isSCCDeadOnExit(MachineBasicBlock *MBB) {
10866 for (MachineBasicBlock *S : MBB->successors()) {
10867 if (S->isLiveIn(AMDGPU::SCC))
10868 return false;
10869 }
10870 return true;
10871}
10872
10873// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
10874// (incoming SCC) = !(SCC defined by SCCDef).
10875// Return true if all uses can be re-written, false otherwise.
10876bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
10877 MachineBasicBlock *MBB = SCCDef->getParent();
10878 SmallVector<MachineInstr *> InvertInstr;
10879 bool SCCIsDead = false;
10880
10881 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
10882 constexpr unsigned ScanLimit = 12;
10883 unsigned Count = 0;
10884 for (MachineInstr &MI :
10885 make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
10886 if (++Count > ScanLimit)
10887 return false;
10888 if (MI.readsRegister(AMDGPU::SCC, &RI)) {
10889 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
10890 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
10891 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10892 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
10893 InvertInstr.push_back(&MI);
10894 else
10895 return false;
10896 }
10897 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
10898 SCCIsDead = true;
10899 break;
10900 }
10901 }
10902 if (!SCCIsDead && isSCCDeadOnExit(MBB))
10903 SCCIsDead = true;
10904
10905 // SCC may have more uses. Can't invert all of them.
10906 if (!SCCIsDead)
10907 return false;
10908
10909 // Invert uses
10910 for (MachineInstr *MI : InvertInstr) {
10911 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
10912 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
10913 swapOperands(*MI);
10914 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10915 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
10916 MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
10917 ? AMDGPU::S_CBRANCH_SCC1
10918 : AMDGPU::S_CBRANCH_SCC0));
10919 } else {
10920 llvm_unreachable("SCC used but no inversion handling");
10921 }
10922 }
10923 return true;
10924}
10925
10926// SCC is already valid after SCCValid.
10927// SCCRedefine will redefine SCC to the same value already available after
10928// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
10929// update kill/dead flags if necessary.
10930bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
10931 bool NeedInversion) const {
10932 MachineInstr *KillsSCC = nullptr;
10933 if (SCCValid->getParent() != SCCRedefine->getParent())
10934 return false;
10935 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
10936 SCCRedefine->getIterator())) {
10937 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
10938 return false;
10939 if (MI.killsRegister(AMDGPU::SCC, &RI))
10940 KillsSCC = &MI;
10941 }
10942 if (NeedInversion && !invertSCCUse(SCCRedefine))
10943 return false;
10944 if (MachineOperand *SccDef =
10945 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
10946 SccDef->setIsDead(false);
10947 if (KillsSCC)
10948 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
10949 SCCRedefine->eraseFromParent();
10950 return true;
10951}
10952
10953static bool foldableSelect(const MachineInstr &Def) {
10954 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
10955 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
10956 return false;
10957 bool Op1IsNonZeroImm =
10958 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
10959 bool Op2IsZeroImm =
10960 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
10961 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
10962 return false;
10963 return true;
10964}
10965
10966bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
10967 Register SrcReg2, int64_t CmpMask,
10968 int64_t CmpValue,
10969 const MachineRegisterInfo *MRI) const {
10970 if (!SrcReg || SrcReg.isPhysical())
10971 return false;
10972
10973 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10974 return false;
10975
10976 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
10977 this](bool NeedInversion) -> bool {
10978 if (CmpValue != 0)
10979 return false;
10980
10981 MachineInstr *Def = MRI->getVRegDef(SrcReg);
10982 if (!Def)
10983 return false;
10984
10985 // For S_OP that set SCC = DST!=0, do the transformation
10986 //
10987 // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
10988
10989 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
10990 // for S_CSELECT* already has the same value that will be calculated by
10991 // s_cmp_lg_*
10992 //
10993 // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
10994 // imm), 0)
10995 if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def))
10996 return false;
10997
10998 if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
10999 return false;
11000
11001 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11002 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
11003 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
11004 // sX = s_cselect_b64 (non-zero imm), 0
11005 // sLo = copy sX.sub0
11006 // sHi = copy sX.sub1
11007 // sY = s_or_b32 sLo, sHi
11008 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11009 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
11010 const MachineOperand &OrOpnd1 = Def->getOperand(1);
11011 const MachineOperand &OrOpnd2 = Def->getOperand(2);
11012 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11013 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
11014 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
11015 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11016 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
11017 Def2->getOperand(1).isReg() &&
11018 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
11019 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
11020 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
11021 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
11022 if (Select && foldableSelect(*Select))
11023 optimizeSCC(Select, Def, false);
11024 }
11025 }
11026 }
11027 return true;
11028 };
11029
11030 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11031 this](int64_t ExpectedValue, unsigned SrcSize,
11032 bool IsReversible, bool IsSigned) -> bool {
11033 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11034 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11035 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11036 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11037 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11038 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11039 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11040 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11041 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11042 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11043 //
11044 // Signed ge/gt are not used for the sign bit.
11045 //
11046 // If result of the AND is unused except in the compare:
11047 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11048 //
11049 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11050 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11051 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11052 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11053 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11054 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
11055
11056 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11057 if (!Def)
11058 return false;
11059
11060 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11061 Def->getOpcode() != AMDGPU::S_AND_B64)
11062 return false;
11063
11064 int64_t Mask;
11065 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11066 if (MO->isImm())
11067 Mask = MO->getImm();
11068 else if (!getFoldableImm(MO, Mask))
11069 return false;
11070 Mask &= maxUIntN(SrcSize);
11071 return isPowerOf2_64(Mask);
11072 };
11073
11074 MachineOperand *SrcOp = &Def->getOperand(1);
11075 if (isMask(SrcOp))
11076 SrcOp = &Def->getOperand(2);
11077 else if (isMask(&Def->getOperand(2)))
11078 SrcOp = &Def->getOperand(1);
11079 else
11080 return false;
11081
11082 // A valid Mask is required to have a single bit set, hence a non-zero and
11083 // power-of-two value. This verifies that we will not do 64-bit shift below.
11084 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11085 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
11086 if (IsSigned && BitNo == SrcSize - 1)
11087 return false;
11088
11089 ExpectedValue <<= BitNo;
11090
11091 bool IsReversedCC = false;
11092 if (CmpValue != ExpectedValue) {
11093 if (!IsReversible)
11094 return false;
11095 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11096 if (!IsReversedCC)
11097 return false;
11098 }
11099
11100 Register DefReg = Def->getOperand(0).getReg();
11101 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
11102 return false;
11103
11104 if (!optimizeSCC(Def, &CmpInstr, false))
11105 return false;
11106
11107 if (!MRI->use_nodbg_empty(DefReg)) {
11108 assert(!IsReversedCC);
11109 return true;
11110 }
11111
11112 // Replace AND with unused result with a S_BITCMP.
11113 MachineBasicBlock *MBB = Def->getParent();
11114
11115 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11116 : AMDGPU::S_BITCMP1_B32
11117 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11118 : AMDGPU::S_BITCMP1_B64;
11119
11120 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
11121 .add(*SrcOp)
11122 .addImm(BitNo);
11123 Def->eraseFromParent();
11124
11125 return true;
11126 };
11127
11128 switch (CmpInstr.getOpcode()) {
11129 default:
11130 break;
11131 case AMDGPU::S_CMP_EQ_U32:
11132 case AMDGPU::S_CMP_EQ_I32:
11133 case AMDGPU::S_CMPK_EQ_U32:
11134 case AMDGPU::S_CMPK_EQ_I32:
11135 return optimizeCmpAnd(1, 32, true, false) ||
11136 optimizeCmpSelect(/*NeedInversion=*/true);
11137 case AMDGPU::S_CMP_GE_U32:
11138 case AMDGPU::S_CMPK_GE_U32:
11139 return optimizeCmpAnd(1, 32, false, false);
11140 case AMDGPU::S_CMP_GE_I32:
11141 case AMDGPU::S_CMPK_GE_I32:
11142 return optimizeCmpAnd(1, 32, false, true);
11143 case AMDGPU::S_CMP_EQ_U64:
11144 return optimizeCmpAnd(1, 64, true, false);
11145 case AMDGPU::S_CMP_LG_U32:
11146 case AMDGPU::S_CMP_LG_I32:
11147 case AMDGPU::S_CMPK_LG_U32:
11148 case AMDGPU::S_CMPK_LG_I32:
11149 return optimizeCmpAnd(0, 32, true, false) ||
11150 optimizeCmpSelect(/*NeedInversion=*/false);
11151 case AMDGPU::S_CMP_GT_U32:
11152 case AMDGPU::S_CMPK_GT_U32:
11153 return optimizeCmpAnd(0, 32, false, false);
11154 case AMDGPU::S_CMP_GT_I32:
11155 case AMDGPU::S_CMPK_GT_I32:
11156 return optimizeCmpAnd(0, 32, false, true);
11157 case AMDGPU::S_CMP_LG_U64:
11158 return optimizeCmpAnd(0, 64, true, false) ||
11159 optimizeCmpSelect(/*NeedInversion=*/false);
11160 }
11161
11162 return false;
11163}
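The core decision in optimizeCmpAnd above is whether the compared value matches the masked bit directly or with inverted polarity. Here is a minimal standalone sketch (an editorial example, not part of this file; foldsWithMask is a hypothetical helper) of that decision for a 32-bit source:

// Sketch: returns 1 if the compare folds with the same polarity (S_BITCMP1
// when the AND result is otherwise unused), -1 if it folds with the condition
// reversed (S_BITCMP0), and 0 if it cannot fold.
#include <cstdint>
#include <cstdio>

static int foldsWithMask(uint64_t Mask, int64_t CmpValue, int64_t ExpectedValue,
                         bool IsReversible) {
  if (!Mask || (Mask & (Mask - 1))) // the mask must have exactly one bit set
    return 0;
  unsigned BitNo = __builtin_ctzll(Mask);
  ExpectedValue <<= BitNo;          // the compare sees the masked bit in place
  if (CmpValue == ExpectedValue)
    return 1;
  if (IsReversible && CmpValue == (ExpectedValue ^ (int64_t)Mask))
    return -1;
  return 0;
}

int main() {
  // s_cmp_lg_u32 (s_and_b32 x, 8), 0 -> folds, same polarity.
  // s_cmp_lg_u32 (s_and_b32 x, 8), 8 -> folds, reversed condition.
  // s_cmp_lg_u32 (s_and_b32 x, 8), 4 -> cannot fold.
  std::printf("%d %d %d\n", foldsWithMask(8, 0, 0, true),
              foldsWithMask(8, 8, 0, true), foldsWithMask(8, 4, 0, true));
  return 0;
}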
11164
11165void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
11166 AMDGPU::OpName OpName) const {
11167 if (!ST.needsAlignedVGPRs())
11168 return;
11169
11170 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11171 if (OpNo < 0)
11172 return;
11173 MachineOperand &Op = MI.getOperand(OpNo);
11174 if (getOpSize(MI, OpNo) > 4)
11175 return;
11176
11177 // Add implicit aligned super-reg to force alignment on the data operand.
11178 const DebugLoc &DL = MI.getDebugLoc();
11179 MachineBasicBlock *BB = MI.getParent();
11180 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11181 Register DataReg = Op.getReg();
11182 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11183 Register Undef = MRI.createVirtualRegister(
11184 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11185 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11186 Register NewVR =
11187 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11188 : &AMDGPU::VReg_64_Align2RegClass);
11189 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11190 .addReg(DataReg, 0, Op.getSubReg())
11191 .addImm(AMDGPU::sub0)
11192 .addReg(Undef)
11193 .addImm(AMDGPU::sub1);
11194 Op.setReg(NewVR);
11195 Op.setSubReg(AMDGPU::sub0);
11196 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11197}
11198
11199bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
11200 if (isIGLP(*MI))
11201 return false;
11202
11203 return TargetInstrInfo::isGlobalMemoryObject(MI);
11204}
11205
11206bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
11207 if (!isWMMA(MI) && !isSWMMAC(MI))
11208 return false;
11209
11210 if (AMDGPU::isGFX1250(ST))
11211 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11212
11213 return true;
11214}
11215
11216bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
11217 unsigned Opcode = MI.getOpcode();
11218
11219 if (AMDGPU::isGFX12Plus(ST))
11220 return isDOT(MI) || isXDLWMMA(MI);
11221
11222 if (!isMAI(MI) || isDGEMM(Opcode) ||
11223 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11224 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11225 return false;
11226
11227 if (!ST.hasGFX940Insts())
11228 return true;
11229
11230 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11231}
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
bool canAddToBBProlog(const MachineInstr &MI) const
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo for the given opcode.
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool mayAccessScratch(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
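A hedged usage sketch: TII, MBB, I, DL and the three registers are assumed to be in scope, and the trailing clamp operand mirrors in-tree callers of the VOP3 add forms rather than anything stated in the one-line brief above.

  // The returned builder already carries the destination (and, on subtargets
  // without a no-carry add, a dead scratch carry def); the caller appends the
  // source operands.
  TII->getAddNoCarry(MBB, I, DL, DestReg)
      .addReg(Src0)
      .addReg(Src1)
      .addImm(0); // clamp bit (assumption: the selected opcode is a VOP3 add)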
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const final
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
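A small, hedged sketch of consuming the returned pair; the address space and FLAT variant arguments are illustrative choices, and TII and COffsetVal are assumed to be in scope.

  // Keep the part that fits the FLAT immediate field on the instruction and
  // fold the remainder into the address computation.
  auto [ImmOffset, RemainderOffset] = TII->splitFlatOffset(
      COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
  // ImmOffset can be encoded directly; RemainderOffset must be added to the
  // base pointer before the access.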
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const override
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
static bool usesLGKM_CNT(const MachineInstr &MI)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst for 16-bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
const MachineOperand & getCalleeOperand(const MachineInstr &MI) const override
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool setsSCCifResultIsNonZero(const MachineInstr &MI)
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
static bool isWWMRegSpillOpcode(uint16_t Opcode)
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
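A minimal sketch of the usual check, assuming TII and MI are in scope; treating a negative result as "no hardware encoding on this subtarget" is an assumption here, not something stated in the one-line brief.

  int MCOp = TII->pseudoToMCOpcode(MI.getOpcode());
  if (MCOp == -1) {
    // No real encoding for this pseudo on the current subtarget generation.
  }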
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
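A minimal sketch of the by-name operand access pattern, assuming TII points at this SIInstrInfo and MI is some VALU MachineInstr; handleImmediate is a hypothetical placeholder, not a real API.

  // Look up src0 by name; the positional index differs between encodings.
  // A null result means the instruction has no such operand.
  if (const MachineOperand *Src0 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (Src0->isImm())
      handleImmediate(Src0->getImm()); // placeholder helper
  }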
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the OpIdx operand of MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual const MachineOperand & getCalleeOperand(const MachineInstr &MI) const
Returns the callee operand from the given MI.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating-point values.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:231
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:224
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:218
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:216
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:236
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:237
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:212
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:248
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:204
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:223
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:242
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:220
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:228
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:587
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:589
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:586
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:588
@ TI_CONSTDATA_START
Definition AMDGPU.h:585
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
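A short sketch of a typical legality check before folding a 64-bit immediate; ST (the GCNSubtarget) and Imm are assumed to be in scope.

  // Inline constants avoid consuming a literal slot or the constant bus.
  bool CanInline =
      AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());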
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:207
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
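A hedged example of the builder interface, assuming MBB, an insertion iterator I, a DebugLoc DL, TII and DestReg are in scope; the opcode is just an illustrative choice.

  // Materialize a 32-bit zero into DestReg; each add* call appends one operand.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), DestReg)
      .addImm(0);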
constexpr unsigned getKillRegState(bool B)
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
Definition MathExtras.h:546
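A self-contained sketch exercising a few of the MathExtras helpers listed on this page (alignDown, divideCeil, maxUIntN, isUInt); the values are arbitrary.

  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  static void mathExtrasExamples() {
    assert(llvm::alignDown(37u, 8u) == 32u); // largest multiple of 8 <= 37
    assert(llvm::divideCeil(37u, 8u) == 5u); // integer ceil(37 / 8)
    assert(llvm::maxUIntN(12) == 4095u);     // 2^12 - 1
    assert(llvm::isUInt<16>(65535));         // fits in 16 unsigned bits
    assert(!llvm::isUInt<16>(65536));        // one past the 16-bit range
  }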
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:202
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
constexpr unsigned getUndefRegState(bool B)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands read the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.