LLVM 23.0.0git
SIInstrInfo.cpp
Go to the documentation of this file.
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/MC/MCContext.h"
38
39using namespace llvm;
40
41#define DEBUG_TYPE "si-instr-info"
42
43#define GET_INSTRINFO_CTOR_DTOR
44#include "AMDGPUGenInstrInfo.inc"
45
46namespace llvm::AMDGPU {
47#define GET_D16ImageDimIntrinsics_IMPL
48#define GET_ImageDimIntrinsicTable_IMPL
49#define GET_RsrcIntrinsics_IMPL
50#include "AMDGPUGenSearchableTables.inc"
51} // namespace llvm::AMDGPU
52
53// Must be at least 4 to be able to branch over minimum unconditional branch
54// code. This is only for making it possible to write reasonably small tests for
55// long branches.
// Registered under the flag name "amdgpu-s-branch-bits", marked
// cl::ReallyHidden, with an initial value of 16.
// NOTE(review): the line that begins this declaration is absent from this
// listing; confirm the option's type against the original file.
57BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
58 cl::desc("Restrict range of branch instructions (DEBUG)"));
59
// Command-line option "amdgpu-fix-16-bit-physreg-copies"; its initial value
// is true.
// NOTE(review): the first and last lines of this declaration are absent from
// this listing; confirm the variable name and remaining arguments against the
// original file.
61 "amdgpu-fix-16-bit-physreg-copies",
62 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
63 cl::init(true),
65
// Constructor initializer list and body: passes ST, RI and the
// ADJCALLSTACKUP / ADJCALLSTACKDOWN opcodes to the AMDGPUGenInstrInfo base,
// initializes the RI and ST members from ST, then initializes SchedModel.
// NOTE(review): the line carrying the constructor's name and parameter list is
// absent from this listing.
67 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
68 AMDGPU::ADJCALLSTACKDOWN),
69 RI(ST), ST(ST) {
70 SchedModel.init(&ST);
71}
72
73//===----------------------------------------------------------------------===//
74// TargetInstrInfo callbacks
75//===----------------------------------------------------------------------===//
76
77static unsigned getNumOperandsNoGlue(SDNode *Node) {
78 unsigned N = Node->getNumOperands();
79 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
80 --N;
81 return N;
82}
83
84/// Returns true if both nodes have the same value for the given
85/// operand \p OpName, or if both nodes do not have this operand.
/// Returns false if only one of the two nodes' opcodes has the operand.
/// Both nodes are queried with getMachineOpcode().
/// NOTE(review): the first line of the signature is absent from this listing.
87 AMDGPU::OpName OpName) {
88 unsigned Opc0 = N0->getMachineOpcode();
89 unsigned Opc1 = N1->getMachineOpcode();
90
91 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
92 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
93
 // Neither opcode has the operand: treated as "same".
94 if (Op0Idx == -1 && Op1Idx == -1)
95 return true;
96
97
 // Exactly one opcode has the operand: the nodes cannot match.
98 if ((Op0Idx == -1 && Op1Idx != -1) ||
99 (Op1Idx == -1 && Op0Idx != -1))
100 return false;
101
102 // getNamedOperandIdx returns the index for the MachineInstr's operands,
103 // which includes the result as the first operand. We are indexing into the
104 // MachineSDNode's operands, so we need to skip the result operand to get
105 // the real index.
106 --Op0Idx;
107 --Op1Idx;
108
109 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
110}
111
// Classifies MI as a rematerialization candidate.
// For SMRD instructions the answer is true only when MI has at least one
// memory operand and every memory operand is an invariant load. Instructions
// matching neither case yield false.
// NOTE(review): the condition guarding the first `return true` is absent from
// this listing; confirm which instruction kinds it accepts.
112static bool canRemat(const MachineInstr &MI) {
113
117 return true;
118
119 if (SIInstrInfo::isSMRD(MI)) {
 // An empty memoperand list is rejected explicitly, since all_of over an
 // empty range would otherwise be true.
120 return !MI.memoperands_empty() &&
121 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
122 return MMO->isLoad() && MMO->isInvariant();
123 });
124 }
125
126 return false;
127}
128
130 const MachineInstr &MI) const {
131
 // Target-specific rematerialization check. When canRemat(MI) holds and MI
 // has no implicit defs, no implicit operands beyond those listed in its
 // descriptor, and cannot raise an FP exception, the answer is true.
 // NOTE(review): the function's name line and its final return statement are
 // absent from this listing; confirm what the fall-through path returns.
132 if (canRemat(MI)) {
133 // Normally VALU use of exec would block the rematerialization, but that
134 // is OK in this case to have an implicit exec read as all VALU do.
135 // We really want all of the generic logic for this except for this.
136
137 // Another potential implicit use is mode register. The core logic of
138 // the RA will not attempt rematerialization if mode is set anywhere
139 // in the function, otherwise it is safe since mode is not changed.
140
141 // This differs from the generic method, which does not allow
142 // rematerialization if there are virtual register uses. We allow this,
143 // therefore this method includes SOP instructions as well.
144 if (!MI.hasImplicitDef() &&
145 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
146 !MI.mayRaiseFPException())
147 return true;
148 }
149
151}
152
153// Returns true if the result of a VALU instruction depends on exec.
154bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
155 assert(isVALU(MI));
156
157 // If it is convergent it depends on EXEC.
158 if (MI.isConvergent())
159 return true;
160
161 // If it defines SGPR it depends on EXEC
162 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
163 for (const MachineOperand &Def : MI.defs()) {
164 if (!Def.isReg())
165 continue;
166
167 Register Reg = Def.getReg();
168 if (Reg && RI.isSGPRReg(MRI, Reg))
169 return true;
170 }
171
172 return false;
173}
174
 // Body of a predicate over a MachineOperand MO: true only when MO is an
 // implicit use of AMDGPU::EXEC on a VALU instruction whose result does not
 // depend on exec (per resultDependsOnExec).
 // NOTE(review): the signature line is absent from this listing. MO.getReg()
 // is called without an isReg() check here; confirm callers only pass
 // register operands.
176 // Any implicit use of exec by VALU is not a real register read.
177 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
178 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
179}
180
182 MachineBasicBlock *SuccToSinkTo,
183 MachineCycleInfo *CI) const {
 // Decides whether MI may be sunk into SuccToSinkTo.
 // SI_IF_BREAK is always allowed. Otherwise every virtual SGPR-class use of
 // MI is examined: starting at the cycle containing its definition and
 // walking outwards through parent cycles that do not contain
 // SuccToSinkTo's cycle, the sink is rejected as soon as one of those cycles
 // has an exiting block with a divergent branch.
 // NOTE(review): the first signature line and the declaration of
 // ExitingBlocks are absent from this listing.
184 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
185 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
186 return true;
187
188 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
189 // Check if sinking of MI would create temporal divergent use.
190 for (auto Op : MI.uses()) {
191 if (Op.isReg() && Op.getReg().isVirtual() &&
192 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
 // NOTE(review): SgprDef is dereferenced without a null check; confirm
 // every such virtual register has a definition at this point.
193 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
194
195 // SgprDef defined inside cycle
196 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
 // A definition outside any cycle cannot trigger the check below.
197 if (FromCycle == nullptr)
198 continue;
199
200 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
201 // Check if there is a FromCycle that contains SgprDef's basic block but
202 // does not contain SuccToSinkTo and also has divergent exit condition.
203 while (FromCycle && !FromCycle->contains(ToCycle)) {
205 FromCycle->getExitingBlocks(ExitingBlocks);
206
207 // FromCycle has divergent exit condition.
208 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
209 if (hasDivergentBranch(ExitingBlock))
210 return false;
211 }
212
213 FromCycle = FromCycle->getParentCycle();
214 }
215 }
216 }
217
218 return true;
219}
220
222 int64_t &Offset0,
223 int64_t &Offset1) const {
 // Given two SelectionDAG nodes Load0 and Load1, reports whether they are
 // loads off the same base and, if so, writes their constant offsets to
 // Offset0 / Offset1. Three pairings are handled: DS with DS, SMRD with
 // SMRD, and any mix of MUBUF/MTBUF. Everything else returns false, and the
 // offsets are left untouched on every false return.
 // NOTE(review): the first signature line and the two dyn_cast initializers
 // for Load0Offset / Load1Offset are absent from this listing.
224 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
225 return false;
226
227 unsigned Opc0 = Load0->getMachineOpcode();
228 unsigned Opc1 = Load1->getMachineOpcode();
229
230 // Make sure both are actually loads.
231 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
232 return false;
233
234 // A mayLoad instruction without a def is not a load. Likely a prefetch.
235 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
236 return false;
237
238 if (isDS(Opc0) && isDS(Opc1)) {
239
240 // FIXME: Handle this case:
241 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
242 return false;
243
244 // Check base reg.
245 if (Load0->getOperand(0) != Load1->getOperand(0))
246 return false;
247
248 // Skip read2 / write2 variants for simplicity.
249 // TODO: We should report true if the used offsets are adjacent (excluded
250 // st64 versions).
251 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
252 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
253 if (Offset0Idx == -1 || Offset1Idx == -1)
254 return false;
255
256 // XXX - be careful of dataless loads
257 // getNamedOperandIdx returns the index for MachineInstrs. Since they
258 // include the outputs in the operand list, but SDNodes don't, we need to
259 // subtract the instruction's number of defs from the index.
260 Offset0Idx -= get(Opc0).NumDefs;
261 Offset1Idx -= get(Opc1).NumDefs;
262 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
263 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
264 return true;
265 }
266
267 if (isSMRD(Opc0) && isSMRD(Opc1)) {
268 // Skip time and cache invalidation instructions.
269 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
270 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
271 return false;
272
273 unsigned NumOps = getNumOperandsNoGlue(Load0);
274 if (NumOps != getNumOperandsNoGlue(Load1))
275 return false;
276
277 // Check base reg.
278 if (Load0->getOperand(0) != Load1->getOperand(0))
279 return false;
280
281 // Match register offsets, if both register and immediate offsets present.
282 assert(NumOps == 4 || NumOps == 5);
283 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
284 return false;
285
286 const ConstantSDNode *Load0Offset =
288 const ConstantSDNode *Load1Offset =
290
 // Only constant offsets can be reported.
291 if (!Load0Offset || !Load1Offset)
292 return false;
293
294 Offset0 = Load0Offset->getZExtValue();
295 Offset1 = Load1Offset->getZExtValue();
296 return true;
297 }
298
299 // MUBUF and MTBUF can access the same addresses.
300 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
301
302 // MUBUF and MTBUF have vaddr at different indices.
303 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
304 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
305 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
306 return false;
307
308 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
309 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
310
311 if (OffIdx0 == -1 || OffIdx1 == -1)
312 return false;
313
314 // getNamedOperandIdx returns the index for MachineInstrs. Since they
315 // include the outputs in the operand list, but SDNodes don't, we need to
316 // subtract the instruction's number of defs from the index.
317 OffIdx0 -= get(Opc0).NumDefs;
318 OffIdx1 -= get(Opc1).NumDefs;
319
320 SDValue Off0 = Load0->getOperand(OffIdx0);
321 SDValue Off1 = Load1->getOperand(OffIdx1);
322
323 // The offset might be a FrameIndexSDNode.
324 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
325 return false;
326
327 Offset0 = Off0->getAsZExtVal();
328 Offset1 = Off1->getAsZExtVal();
329 return true;
330 }
331
332 return false;
333}
334
335static bool isStride64(unsigned Opc) {
336 switch (Opc) {
337 case AMDGPU::DS_READ2ST64_B32:
338 case AMDGPU::DS_READ2ST64_B64:
339 case AMDGPU::DS_WRITE2ST64_B32:
340 case AMDGPU::DS_WRITE2ST64_B64:
341 return true;
342 default:
343 return false;
344 }
345}
346
349 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
350 const TargetRegisterInfo *TRI) const {
 // Describes the memory access made by LdSt: appends its base operand(s) to
 // BaseOps, stores an immediate offset in Offset and the access size in
 // Width. OffsetIsScalable is always set to false once LdSt is known to
 // load or store. Handles DS, MUBUF/MTBUF, image, SMRD and FLAT
 // instructions, in that order; returns false for anything else or when the
 // needed operands are missing.
 // NOTE(review): BaseOps may already have entries appended when a later
 // check returns false (e.g. the "LDS DMA" exits); confirm callers discard
 // BaseOps on failure.
 // NOTE(review): the first signature lines are absent from this listing.
351 if (!LdSt.mayLoadOrStore())
352 return false;
353
354 unsigned Opc = LdSt.getOpcode();
355 OffsetIsScalable = false;
356 const MachineOperand *BaseOp, *OffsetOp;
357 int DataOpIdx;
358
359 if (isDS(LdSt)) {
360 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
361 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
362 if (OffsetOp) {
363 // Normal, single offset LDS instruction.
364 if (!BaseOp) {
365 // DS_CONSUME/DS_APPEND use M0 for the base address.
366 // TODO: find the implicit use operand for M0 and use that as BaseOp?
367 return false;
368 }
369 BaseOps.push_back(BaseOp);
370 Offset = OffsetOp->getImm();
371 // Get appropriate operand, and compute width accordingly.
372 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
373 if (DataOpIdx == -1)
374 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
 // This opcode gets a fixed width instead of one derived from an operand.
375 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
376 Width = LocationSize::precise(64);
377 else
378 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
379 } else {
380 // The 2 offset instructions use offset0 and offset1 instead. We can treat
381 // these as a load with a single offset if the 2 offsets are consecutive.
382 // We will use this for some partially aligned loads.
 // NOTE(review): Offset0Op, Offset1Op and BaseOp are used without null
 // checks on this path; confirm every DS opcode lacking `offset` has
 // `addr`, `offset0` and `offset1`.
383 const MachineOperand *Offset0Op =
384 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
385 const MachineOperand *Offset1Op =
386 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
387
388 unsigned Offset0 = Offset0Op->getImm() & 0xff;
389 unsigned Offset1 = Offset1Op->getImm() & 0xff;
390 if (Offset0 + 1 != Offset1)
391 return false;
392
393 // Each of these offsets is in element sized units, so we need to convert
394 // to bytes of the individual reads.
395
396 unsigned EltSize;
 // Loads divide operand 0's register size in bits by 16, stores divide
 // data0's by 8; presumably the load result register holds both elements
 // (bits / 8 / 2) -- TODO confirm.
397 if (LdSt.mayLoad())
398 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
399 else {
400 assert(LdSt.mayStore());
401 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
402 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
403 }
404
 // ST64 variants scale the element size by 64.
405 if (isStride64(Opc))
406 EltSize *= 64;
407
408 BaseOps.push_back(BaseOp);
409 Offset = EltSize * Offset0;
410 // Get appropriate operand(s), and compute width accordingly.
411 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
412 if (DataOpIdx == -1) {
 // No vdst: the width is the sum of the data0 and data1 operand sizes.
413 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
414 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
415 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
416 Width = LocationSize::precise(
417 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
418 } else {
419 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
420 }
421 }
422 return true;
423 }
424
425 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
426 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
427 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
428 return false;
429 BaseOps.push_back(RSrc);
430 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
 // A frame-index vaddr is not recorded as a base operand.
431 if (BaseOp && !BaseOp->isFI())
432 BaseOps.push_back(BaseOp);
433 const MachineOperand *OffsetImm =
434 getNamedOperand(LdSt, AMDGPU::OpName::offset);
435 Offset = OffsetImm->getImm();
436 const MachineOperand *SOffset =
437 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
 // A register soffset becomes another base operand; an immediate soffset
 // is folded into Offset.
438 if (SOffset) {
439 if (SOffset->isReg())
440 BaseOps.push_back(SOffset);
441 else
442 Offset += SOffset->getImm();
443 }
444 // Get appropriate operand, and compute width accordingly.
445 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
446 if (DataOpIdx == -1)
447 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
448 if (DataOpIdx == -1) // LDS DMA
449 return false;
450 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
451 return true;
452 }
453
454 if (isImage(LdSt)) {
 // The resource operand is named srsrc for MIMG and rsrc otherwise.
455 auto RsrcOpName =
456 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
457 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
458 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
459 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
460 if (VAddr0Idx >= 0) {
461 // GFX10 possible NSA encoding.
 // Every operand from vaddr0 up to (not including) the resource operand
 // is recorded as a base operand.
462 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
463 BaseOps.push_back(&LdSt.getOperand(I));
464 } else {
465 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
466 }
467 Offset = 0;
468 // Get appropriate operand, and compute width accordingly.
469 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
470 if (DataOpIdx == -1)
471 return false; // no return sampler
472 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
473 return true;
474 }
475
476 if (isSMRD(LdSt)) {
477 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
478 if (!BaseOp) // e.g. S_MEMTIME
479 return false;
480 BaseOps.push_back(BaseOp);
481 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
 // A missing offset operand is treated as offset 0.
482 Offset = OffsetOp ? OffsetOp->getImm() : 0;
483 // Get appropriate operand, and compute width accordingly.
484 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
485 if (DataOpIdx == -1)
486 return false;
487 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
488 return true;
489 }
490
491 if (isFLAT(LdSt)) {
492 // Instructions have either vaddr or saddr or both or none.
493 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
494 if (BaseOp)
495 BaseOps.push_back(BaseOp);
496 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
497 if (BaseOp)
498 BaseOps.push_back(BaseOp);
499 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
500 // Get appropriate operand, and compute width accordingly.
501 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
502 if (DataOpIdx == -1)
503 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
504 if (DataOpIdx == -1) // LDS DMA
505 return false;
506 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
507 return true;
508 }
509
510 return false;
511}
512
// Returns true if the two memory instructions appear to address the same base.
// The first base operands are compared directly; failing that, the single
// memory operand of each instruction is consulted: address spaces must match
// and the underlying objects of their IR values must be the same non-undef
// value.
// Precondition visible in the code: BaseOps1 and BaseOps2 are non-empty
// (front() is called unconditionally).
// NOTE(review): the two parameter lines declaring BaseOps1 and BaseOps2 are
// absent from this listing.
513static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
515 const MachineInstr &MI2,
517 // Only examine the first "base" operand of each instruction, on the
518 // assumption that it represents the real base address of the memory access.
519 // Other operands are typically offsets or indices from this base address.
520 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
521 return true;
522
 // The fallback below needs exactly one memory operand per instruction.
523 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
524 return false;
525
526 auto *MO1 = *MI1.memoperands_begin();
527 auto *MO2 = *MI2.memoperands_begin();
528 if (MO1->getAddrSpace() != MO2->getAddrSpace())
529 return false;
530
531 const auto *Base1 = MO1->getValue();
532 const auto *Base2 = MO2->getValue();
 // Memory operands without an IR value give nothing to compare.
533 if (!Base1 || !Base2)
534 return false;
535 Base1 = getUnderlyingObject(Base1);
536 Base2 = getUnderlyingObject(Base2);
537
 // Undef underlying objects are never considered the same base.
538 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
539 return false;
540
541 return Base1 == Base2;
542}
543
545 int64_t Offset1, bool OffsetIsScalable1,
547 int64_t Offset2, bool OffsetIsScalable2,
548 unsigned ClusterSize,
549 unsigned NumBytes) const {
 // Clustering heuristic for memory operations. Returns false when the two
 // ops do not share a base pointer (or when exactly one of them has no base
 // operands); otherwise returns whether the DWORD footprint of the cluster
 // stays within the limit. The limit is
 // DefaultMemoryClusterDWordsLimit unless base operands are available, in
 // which case it comes from the function's SIMachineFunctionInfo.
 // Offset1/Offset2 and the OffsetIsScalable flags are not read in the
 // visible body.
 // NOTE(review): the signature lines declaring the function name and the
 // BaseOps1 / BaseOps2 parameters are absent from this listing.
550 // If the mem ops (to be clustered) do not have the same base ptr, then they
551 // should not be clustered
552 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
553 if (!BaseOps1.empty() && !BaseOps2.empty()) {
554 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
555 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
556 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
557 return false;
558
559 const SIMachineFunctionInfo *MFI =
560 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
561 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
562 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
563 // If only one base op is empty, they do not have the same base ptr
564 return false;
565 }
566
567 // In order to avoid register pressure, on an average, the number of DWORDS
568 // loaded together by all clustered mem ops should not exceed
569 // MaxMemoryClusterDWords. This is an empirical value based on certain
570 // observations and performance related experiments.
571 // The good thing about this heuristic is - it avoids clustering of too many
572 // sub-word loads, and also avoids clustering of wide loads. Below is the
573 // brief summary of how the heuristic behaves for various `LoadSize` when
574 // MaxMemoryClusterDWords is 8.
575 //
576 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
577 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
578 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
579 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
580 // (5) LoadSize >= 17: do not cluster
 // LoadSize is the average number of bytes per op (integer division), and
 // each op is rounded up to whole DWORDs before multiplying by ClusterSize.
 // NOTE(review): ClusterSize is used as a divisor; confirm callers never
 // pass 0.
581 const unsigned LoadSize = NumBytes / ClusterSize;
582 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
583 return NumDWords <= MaxMemoryClusterDWords;
584}
585
586// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
587// the first 16 loads will be interleaved with the stores, and the next 16 will
588// be clustered as expected. It should really split into 2 16 store batches.
589//
590// Loads are clustered until this returns false, rather than trying to schedule
591// groups of stores. This also means we have to deal with saying different
592// address space loads should be clustered, and ones which might cause bank
593// conflicts.
594//
595// This might be deprecated so it might not be worth that much effort to fix.
// NOTE(review): the first signature line is absent from this listing.
597 int64_t Offset0, int64_t Offset1,
598 unsigned NumLoads) const {
 // Callers must pass the offsets in increasing order; this is only checked
 // by the assertion below.
599 assert(Offset1 > Offset0 &&
600 "Second offset should be larger than first offset!");
601 // If we have at most 16 loads in a row, and the offsets are less than 64
602 // bytes apart, then schedule together.
603
604 // A cacheline is 64 bytes (for global memory).
605 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
606}
607
610 const DebugLoc &DL, MCRegister DestReg,
611 MCRegister SrcReg, bool KillSrc,
612 const char *Msg = "illegal VGPR to SGPR copy") {
 // Reports an unsupported copy: emits a DS_Error diagnostic carrying Msg for
 // the enclosing function at DL, then inserts an SI_ILLEGAL_COPY instruction
 // from SrcReg to DestReg before MI, so the block still contains an
 // instruction for the copy.
 // NOTE(review): the first signature lines and the declaration of `C` are
 // absent from this listing.
613 MachineFunction *MF = MBB.getParent();
614
616 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
617
618 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
619 .addReg(SrcReg, getKillRegState(KillSrc));
620}
621
622/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
623/// possible to have a direct copy in these cases on GFX908, so an intermediate
624/// VGPR copy is required.
///
/// Two strategies are tried in order. When the registers do not overlap, the
/// block is scanned backwards from \p MI for a V_ACCVGPR_WRITE_B32_e64 that
/// defines \p SrcReg; if its source can be safely reused, a new accvgpr_write
/// of that same source into \p DestReg is emitted and the function returns.
/// Otherwise the value is copied into a temporary VGPR and from there written
/// to \p DestReg.
/// NOTE(review): the first signature lines and the second argument of both
/// `addReg(ImpUseSuperReg, ...)` calls are absent from this listing.
628 const DebugLoc &DL, MCRegister DestReg,
629 MCRegister SrcReg, bool KillSrc,
630 RegScavenger &RS, bool RegsOverlap,
631 Register ImpDefSuperReg = Register(),
632 Register ImpUseSuperReg = Register()) {
633 assert((TII.getSubtarget().hasMAIInsts() &&
634 !TII.getSubtarget().hasGFX90AInsts()) &&
635 "Expected GFX908 subtarget.");
636
637 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
638 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
639 "Source register of the copy should be either an SGPR or an AGPR.");
640
641 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
642 "Destination register of the copy should be an AGPR.");
643
644 const SIRegisterInfo &RI = TII.getRegisterInfo();
645
646 // First try to find defining accvgpr_write to avoid temporary registers.
647 // In the case of copies of overlapping AGPRs, we conservatively do not
648 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
649 // an accvgpr_write used for this same copy due to implicit-defs
650 if (!RegsOverlap) {
651 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
652 --Def;
653
 // Skip instructions that do not touch SrcReg.
654 if (!Def->modifiesRegister(SrcReg, &RI))
655 continue;
656
 // The nearest writer of SrcReg must be an accvgpr_write whose
 // destination is exactly SrcReg; anything else ends the search.
657 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
658 Def->getOperand(0).getReg() != SrcReg)
659 break;
660
661 MachineOperand &DefOp = Def->getOperand(1);
662 assert(DefOp.isReg() || DefOp.isImm());
663
664 if (DefOp.isReg()) {
665 bool SafeToPropagate = true;
666 // Check that register source operand is not clobbered before MI.
667 // Immediate operands are always safe to propagate.
668 for (auto I = Def; I != MI && SafeToPropagate; ++I)
669 if (I->modifiesRegister(DefOp.getReg(), &RI))
670 SafeToPropagate = false;
671
672 if (!SafeToPropagate)
673 break;
674
 // The source register gains a new use at MI, so kill flags on it
 // between Def and MI are cleared.
675 for (auto I = Def; I != MI; ++I)
676 I->clearRegisterKills(DefOp.getReg(), &RI);
677 }
678
679 MachineInstrBuilder Builder =
680 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
681 .add(DefOp);
682 if (ImpDefSuperReg)
683 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
684
685 if (ImpUseSuperReg) {
686 Builder.addReg(ImpUseSuperReg,
688 }
689
690 return;
691 }
692 }
693
694 RS.enterBasicBlockEnd(MBB);
695 RS.backward(std::next(MI));
696
697 // Ideally we want to have three registers for a long reg_sequence copy
698 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
699 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
700 *MBB.getParent());
701
702 // Registers in the sequence are allocated contiguously so we can just
703 // use register number to pick one of three round-robin temps.
704 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
 // Default temporary: the VGPR the function info provides for AGPR copies.
705 Register Tmp =
706 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
707 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
708 "VGPR used for an intermediate copy should have been reserved.");
709
710 // Only loop through if there are any free registers left. We don't want to
711 // spill.
 // Runs RegNo times (0, 1 or 2); each successful scavenge replaces Tmp and
 // marks it used so the next iteration picks a different register.
712 while (RegNo--) {
713 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
714 /* RestoreAfter */ false, 0,
715 /* AllowSpill */ false);
716 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
717 break;
718 Tmp = Tmp2;
719 RS.setRegUsed(Tmp);
720 }
721
722 // Insert copy to temporary VGPR.
 // An AGPR source is read with accvgpr_read; an SGPR source with v_mov_b32.
723 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
724 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
725 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
726 } else {
727 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
728 }
729
730 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
731 .addReg(SrcReg, getKillRegState(KillSrc));
732 if (ImpUseSuperReg) {
733 UseBuilder.addReg(ImpUseSuperReg,
735 }
736
 // Second half of the indirect copy: temporary VGPR into the destination AGPR.
737 MachineInstrBuilder DefBuilder
738 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
739 .addReg(Tmp, RegState::Kill);
740
741 if (ImpDefSuperReg)
742 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
743}
744
747 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
748 const TargetRegisterClass *RC, bool Forward) {
 // Expands a copy between SGPR tuples of class RC into a sequence of
 // S_MOV_B32 / S_MOV_B64 instructions over the sub-registers returned by
 // getRegSplitParts(RC, 4). Each emitted move carries an implicit use of the
 // full SrcReg; the first move in program order also gets an implicit def of
 // the full DestReg, and the last one kills SrcReg when KillSrc is set.
 // When Forward is false the insertion point is stepped back after each
 // move, so the moves are emitted in reverse program order.
 // NOTE(review): the first signature lines and source line 751 are absent
 // from this listing.
749 const SIRegisterInfo &RI = TII.getRegisterInfo();
750 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
752 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
753
754 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
755 int16_t SubIdx = BaseIndices[Idx];
756 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
757 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
758 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
759 unsigned Opcode = AMDGPU::S_MOV_B32;
760
761 // Is SGPR aligned? If so try to combine with next.
762 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
763 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
764 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
765 // Can use SGPR64 copy
 // Switch to the two-channel sub-register index starting at this
 // channel and consume the next split part as well (Idx++ below).
766 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
767 SubIdx = RI.getSubRegFromChannel(Channel, 2);
768 DestSubReg = RI.getSubReg(DestReg, SubIdx);
769 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
770 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
771 Opcode = AMDGPU::S_MOV_B64;
772 Idx++;
773 }
774
775 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
776 .addReg(SrcSubReg)
777 .addReg(SrcReg, RegState::Implicit);
778
779 if (!FirstMI)
780 FirstMI = LastMI;
781
782 if (!Forward)
783 I--;
784 }
785
786 assert(FirstMI && LastMI);
 // For backward emission the first instruction built is the last one in
 // program order, so swap the roles.
787 if (!Forward)
788 std::swap(FirstMI, LastMI);
789
790 FirstMI->addOperand(
791 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
792
793 if (KillSrc)
794 LastMI->addRegisterKilled(SrcReg, &RI);
795}
796
799 const DebugLoc &DL, Register DestReg,
800 Register SrcReg, bool KillSrc, bool RenamableDest,
801 bool RenamableSrc) const {
802 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
803 unsigned Size = RI.getRegSizeInBits(*RC);
804 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
805 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
806
807 // The rest of copyPhysReg assumes Src and Dst size are the same size.
808 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
809 // we remove Fix16BitCopies and this code block?
810 if (Fix16BitCopies) {
811 if (((Size == 16) != (SrcSize == 16))) {
812 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
813 assert(ST.useRealTrue16Insts());
814 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
815 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
816 RegToFix = SubReg;
817
818 if (DestReg == SrcReg) {
819 // Identity copy. Insert empty bundle since ExpandPostRA expects an
820 // instruction here.
821 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
822 return;
823 }
824 RC = RI.getPhysRegBaseClass(DestReg);
825 Size = RI.getRegSizeInBits(*RC);
826 SrcRC = RI.getPhysRegBaseClass(SrcReg);
827 SrcSize = RI.getRegSizeInBits(*SrcRC);
828 }
829 }
830
831 if (RC == &AMDGPU::VGPR_32RegClass) {
832 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
833 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
834 AMDGPU::AGPR_32RegClass.contains(SrcReg));
835 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
836 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
837 BuildMI(MBB, MI, DL, get(Opc), DestReg)
838 .addReg(SrcReg, getKillRegState(KillSrc));
839 return;
840 }
841
842 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
843 RC == &AMDGPU::SReg_32RegClass) {
844 if (SrcReg == AMDGPU::SCC) {
845 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
846 .addImm(1)
847 .addImm(0);
848 return;
849 }
850
851 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
852 if (DestReg == AMDGPU::VCC_LO) {
853 // FIXME: Hack until VReg_1 removed.
854 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
855 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
856 .addImm(0)
857 .addReg(SrcReg, getKillRegState(KillSrc));
858 return;
859 }
860
861 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
862 return;
863 }
864
865 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
866 .addReg(SrcReg, getKillRegState(KillSrc));
867 return;
868 }
869
870 if (RC == &AMDGPU::SReg_64RegClass) {
871 if (SrcReg == AMDGPU::SCC) {
872 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
873 .addImm(1)
874 .addImm(0);
875 return;
876 }
877
878 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
879 if (DestReg == AMDGPU::VCC) {
880 // FIXME: Hack until VReg_1 removed.
881 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
882 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
883 .addImm(0)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 return;
886 }
887
888 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
889 return;
890 }
891
892 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
893 .addReg(SrcReg, getKillRegState(KillSrc));
894 return;
895 }
896
897 if (DestReg == AMDGPU::SCC) {
898 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
899 // but SelectionDAG emits such copies for i1 sources.
900 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
901 // This copy can only be produced by patterns
902 // with explicit SCC, which are known to be enabled
903 // only for subtargets with S_CMP_LG_U64 present.
904 assert(ST.hasScalarCompareEq64());
905 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
906 .addReg(SrcReg, getKillRegState(KillSrc))
907 .addImm(0);
908 } else {
909 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
910 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
911 .addReg(SrcReg, getKillRegState(KillSrc))
912 .addImm(0);
913 }
914
915 return;
916 }
917
918 if (RC == &AMDGPU::AGPR_32RegClass) {
919 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
920 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
921 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
922 .addReg(SrcReg, getKillRegState(KillSrc));
923 return;
924 }
925
926 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
927 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
928 .addReg(SrcReg, getKillRegState(KillSrc));
929 return;
930 }
931
932 // FIXME: Pass should maintain scavenger to avoid scan through the block on
933 // every AGPR spill.
934 RegScavenger RS;
935 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
936 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
937 return;
938 }
939
940 if (Size == 16) {
941 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
942 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
943 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
944
945 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
946 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
947 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
948 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
949 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
950 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
951 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
952 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
953
954 if (IsSGPRDst) {
955 if (!IsSGPRSrc) {
956 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
957 return;
958 }
959
960 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
961 .addReg(NewSrcReg, getKillRegState(KillSrc));
962 return;
963 }
964
965 if (IsAGPRDst || IsAGPRSrc) {
966 if (!DstLow || !SrcLow) {
967 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
968 "Cannot use hi16 subreg with an AGPR!");
969 }
970
971 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
972 return;
973 }
974
975 if (ST.useRealTrue16Insts()) {
976 if (IsSGPRSrc) {
977 assert(SrcLow);
978 SrcReg = NewSrcReg;
979 }
980 // Use the smaller instruction encoding if possible.
981 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
982 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
983 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
984 .addReg(SrcReg);
985 } else {
986 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
987 .addImm(0) // src0_modifiers
988 .addReg(SrcReg)
989 .addImm(0); // op_sel
990 }
991 return;
992 }
993
994 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
995 if (!DstLow || !SrcLow) {
996 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
997 "Cannot use hi16 subreg on VI!");
998 }
999
1000 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1001 .addReg(NewSrcReg, getKillRegState(KillSrc));
1002 return;
1003 }
1004
1005 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1006 .addImm(0) // src0_modifiers
1007 .addReg(NewSrcReg)
1008 .addImm(0) // clamp
1015 // First implicit operand is $exec.
1016 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1017 return;
1018 }
1019
1020 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1021 if (ST.hasVMovB64Inst()) {
1022 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1023 .addReg(SrcReg, getKillRegState(KillSrc));
1024 return;
1025 }
1026 if (ST.hasPkMovB32()) {
1027 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1029 .addReg(SrcReg)
1031 .addReg(SrcReg)
1032 .addImm(0) // op_sel_lo
1033 .addImm(0) // op_sel_hi
1034 .addImm(0) // neg_lo
1035 .addImm(0) // neg_hi
1036 .addImm(0) // clamp
1037 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1038 return;
1039 }
1040 }
1041
1042 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1043 if (RI.isSGPRClass(RC)) {
1044 if (!RI.isSGPRClass(SrcRC)) {
1045 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1046 return;
1047 }
1048 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1049 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1050 Forward);
1051 return;
1052 }
1053
1054 unsigned EltSize = 4;
1055 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1056 if (RI.isAGPRClass(RC)) {
1057 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1058 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1059 else if (RI.hasVGPRs(SrcRC) ||
1060 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1061 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1062 else
1063 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1064 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1065 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1066 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1067 (RI.isProperlyAlignedRC(*RC) &&
1068 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1069 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1070 if (ST.hasVMovB64Inst()) {
1071 Opcode = AMDGPU::V_MOV_B64_e32;
1072 EltSize = 8;
1073 } else if (ST.hasPkMovB32()) {
1074 Opcode = AMDGPU::V_PK_MOV_B32;
1075 EltSize = 8;
1076 }
1077 }
1078
1079 // For the cases where we need an intermediate instruction/temporary register
1080 // (destination is an AGPR), we need a scavenger.
1081 //
1082 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1083 // whole block for every handled copy.
1084 std::unique_ptr<RegScavenger> RS;
1085 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1086 RS = std::make_unique<RegScavenger>();
1087
1088 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1089
1090 // If there is an overlap, we can't kill the super-register on the last
1091 // instruction, since it will also kill the components made live by this def.
1092 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1093 const bool CanKillSuperReg = KillSrc && !Overlap;
1094
1095 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1096 unsigned SubIdx;
1097 if (Forward)
1098 SubIdx = SubIndices[Idx];
1099 else
1100 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1101 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1102 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1103 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1104
1105 bool IsFirstSubreg = Idx == 0;
1106 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1107
1108 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1109 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1110 Register ImpUseSuper = SrcReg;
1111 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1112 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1113 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1115 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1117 .addReg(SrcSubReg)
1119 .addReg(SrcSubReg)
1120 .addImm(0) // op_sel_lo
1121 .addImm(0) // op_sel_hi
1122 .addImm(0) // neg_lo
1123 .addImm(0) // neg_hi
1124 .addImm(0) // clamp
1125 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1126 if (IsFirstSubreg)
1128 } else {
1129 MachineInstrBuilder Builder =
1130 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1131 if (IsFirstSubreg)
1132 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1133
1134 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1135 }
1136 }
1137}
1138
1139int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1140 int32_t NewOpc;
1141
1142 // Try to map original to commuted opcode
1143 NewOpc = AMDGPU::getCommuteRev(Opcode);
1144 if (NewOpc != -1)
1145 // Check if the commuted (REV) opcode exists on the target.
1146 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1147
1148 // Try to map commuted to original opcode
1149 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1150 if (NewOpc != -1)
1151 // Check if the original (non-REV) opcode exists on the target.
1152 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1153
1154 return Opcode;
1155}
1156
// Always returns &AMDGPU::VGPR_32RegClass.
// NOTE(review): the declarator line (embedded number 1158) is missing from
// this listing, so the function name and parameters cannot be confirmed here.
1157const TargetRegisterClass *
1159 return &AMDGPU::VGPR_32RegClass;
1160}
1161
// Emits a V_CNDMASK_B32_e64 at I in MBB that writes DstReg, choosing between
// TrueReg and FalseReg according to Cond. DstReg must be in VGPR_32RegClass.
//   * Cond.size() == 1: Cond[0] is copied into a fresh wave-mask register that
//     becomes the select mask.
//   * Cond.size() == 2: Cond[0] is an immediate branch predicate; for the
//     VCCNZ / VCCZ predicates Cond[1] supplies the mask register.
// Any other Cond size is unreachable.
// NOTE(review): parts of the signature and the declaration of LMC (embedded
// numbers 1162, 1163, 1165, 1170) are missing from this listing.
1164 const DebugLoc &DL, Register DstReg,
1166 Register TrueReg,
1167 Register FalseReg) const {
1168 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1169 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1171 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1172 "Not a VGPR32 reg");
1173
1174 if (Cond.size() == 1) {
1175 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1176 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1177 .add(Cond[0]);
// Operand order used throughout: 0, FalseReg, 0, TrueReg, mask.
1178 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1179 .addImm(0)
1180 .addReg(FalseReg)
1181 .addImm(0)
1182 .addReg(TrueReg)
1183 .addReg(SReg);
1184 } else if (Cond.size() == 2) {
1185 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1186 switch (Cond[0].getImm()) {
1187 case SIInstrInfo::SCC_TRUE: {
1188 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
// Build the mask with LMC.CSelectOpc over the immediates (1, 0).
1189 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1190 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1191 .addImm(0)
1192 .addReg(FalseReg)
1193 .addImm(0)
1194 .addReg(TrueReg)
1195 .addReg(SReg);
1196 break;
1197 }
1198 case SIInstrInfo::SCC_FALSE: {
1199 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
// Same as SCC_TRUE but with the select immediates swapped to (0, 1).
1200 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1201 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1202 .addImm(0)
1203 .addReg(FalseReg)
1204 .addImm(0)
1205 .addReg(TrueReg)
1206 .addReg(SReg);
1207 break;
1208 }
1209 case SIInstrInfo::VCCNZ: {
// Work on a copy of Cond[1] with the implicit flag cleared before using it
// as the COPY source.
1210 MachineOperand RegOp = Cond[1];
1211 RegOp.setImplicit(false);
1212 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1213 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1214 .add(RegOp);
1215 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1216 .addImm(0)
1217 .addReg(FalseReg)
1218 .addImm(0)
1219 .addReg(TrueReg)
1220 .addReg(SReg);
1221 break;
1222 }
1223 case SIInstrInfo::VCCZ: {
1224 MachineOperand RegOp = Cond[1];
1225 RegOp.setImplicit(false);
1226 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1227 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1228 .add(RegOp);
// Inverted predicate: TrueReg and FalseReg swap operand positions here.
1229 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1230 .addImm(0)
1231 .addReg(TrueReg)
1232 .addImm(0)
1233 .addReg(FalseReg)
1234 .addReg(SReg);
1235 break;
1236 }
1237 case SIInstrInfo::EXECNZ: {
1238 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
// SReg2 receives the result of LMC.OrSaveExecOpc and is not read again in
// this block.
1239 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1240 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1241 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1242 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1243 .addImm(0)
1244 .addReg(FalseReg)
1245 .addImm(0)
1246 .addReg(TrueReg)
1247 .addReg(SReg);
1248 break;
1249 }
1250 case SIInstrInfo::EXECZ: {
1251 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1252 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1253 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1254 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1255 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1256 .addImm(0)
1257 .addReg(FalseReg)
1258 .addImm(0)
1259 .addReg(TrueReg)
1260 .addReg(SReg);
// NOTE(review): the sequence above is emitted and then llvm_unreachable
// fires, so EXECZ is effectively treated as unsupported; the break below is
// never reached.
1261 llvm_unreachable("Unhandled branch predicate EXECZ");
1262 break;
1263 }
1264 default:
1265 llvm_unreachable("invalid branch predicate");
1266 }
1267 } else {
1268 llvm_unreachable("Can only handle Cond size 1 or 2");
1269 }
1270}
1271
// Emits V_CMP_EQ_I32_e64 at I in *MBB with operands (Value, SrcReg), writing a
// freshly created virtual register of class RI.getBoolRC(), and returns that
// register.
// NOTE(review): the start of the signature (embedded numbers 1272-1273) is
// missing from this listing.
1274 const DebugLoc &DL,
1275 Register SrcReg, int Value) const {
1276 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1277 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1278 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1279 .addImm(Value)
1280 .addReg(SrcReg);
1281
1282 return Reg;
1283}
1284
// Emits V_CMP_NE_I32_e64 at I in *MBB with operands (Value, SrcReg), writing a
// freshly created virtual register of class RI.getBoolRC(), and returns that
// register.
// NOTE(review): the start of the signature (embedded numbers 1285-1286) is
// missing from this listing.
1287 const DebugLoc &DL,
1288 Register SrcReg, int Value) const {
1289 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1290 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1291 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1292 .addImm(Value)
1293 .addReg(SrcReg);
1294
1295 return Reg;
1296}
1297
// If MI is one of the recognized opcodes and its source operand is an
// immediate, stores the resulting constant in ImmVal and returns whether MI's
// operand 0 is Reg. Returns false for any other opcode or a non-immediate
// source.
// Note that ImmVal is written whenever the source is an immediate, even if
// operand 0 turns out not to be Reg.
// NOTE(review): the start of the signature (embedded number 1298) is missing
// from this listing.
1299 const Register Reg,
1300 int64_t &ImmVal) const {
1301 switch (MI.getOpcode()) {
// Plain moves: the immediate in operand 1 is taken as-is.
1302 case AMDGPU::V_MOV_B32_e32:
1303 case AMDGPU::S_MOV_B32:
1304 case AMDGPU::S_MOVK_I32:
1305 case AMDGPU::S_MOV_B64:
1306 case AMDGPU::V_MOV_B64_e32:
1307 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1308 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1309 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1310 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1311 case AMDGPU::V_MOV_B64_PSEUDO:
1312 case AMDGPU::V_MOV_B16_t16_e32: {
1313 const MachineOperand &Src0 = MI.getOperand(1);
1314 if (Src0.isImm()) {
1315 ImmVal = Src0.getImm();
1316 return MI.getOperand(0).getReg() == Reg;
1317 }
1318
1319 return false;
1320 }
// The e64 form carries its source in operand 2; it is only accepted when the
// immediate in operand 1 is zero.
1321 case AMDGPU::V_MOV_B16_t16_e64: {
1322 const MachineOperand &Src0 = MI.getOperand(2);
1323 if (Src0.isImm() && !MI.getOperand(1).getImm()) {
1324 ImmVal = Src0.getImm();
1325 return MI.getOperand(0).getReg() == Reg;
1326 }
1327
1328 return false;
1329 }
// Bit-reverse: the constant is the 32-bit bit-reversal of the immediate,
// widened to int64_t.
1330 case AMDGPU::S_BREV_B32:
1331 case AMDGPU::V_BFREV_B32_e32:
1332 case AMDGPU::V_BFREV_B32_e64: {
1333 const MachineOperand &Src0 = MI.getOperand(1);
1334 if (Src0.isImm()) {
1335 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1336 return MI.getOperand(0).getReg() == Reg;
1337 }
1338
1339 return false;
1340 }
// Bitwise NOT: the immediate is narrowed to int32_t, complemented, then
// widened to int64_t.
1341 case AMDGPU::S_NOT_B32:
1342 case AMDGPU::V_NOT_B32_e32:
1343 case AMDGPU::V_NOT_B32_e64: {
1344 const MachineOperand &Src0 = MI.getOperand(1);
1345 if (Src0.isImm()) {
1346 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1347 return MI.getOperand(0).getReg() == Reg;
1348 }
1349
1350 return false;
1351 }
1352 default:
1353 return false;
1354 }
1355}
1356
// Returns the constant behind Op, if one can be found:
//   * Op is an immediate: its value.
//   * Op is a virtual register whose defining instruction isMoveImmediate()
//     and has an immediate operand 1: that immediate passed through
//     extractSubregFromImm() with Op's subregister index.
//   * Otherwise std::nullopt.
// NOTE(review): the declarator line (embedded number 1358) is missing from
// this listing.
1357std::optional<int64_t>
1359 if (Op.isImm())
1360 return Op.getImm();
1361
1362 if (!Op.isReg() || !Op.getReg().isVirtual())
1363 return std::nullopt;
1364 MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
1365 const MachineInstr *Def = MRI.getVRegDef(Op.getReg());
1366 if (Def && Def->isMoveImmediate()) {
1367 const MachineOperand &ImmSrc = Def->getOperand(1);
1368 if (ImmSrc.isImm())
1369 return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
1370 }
1371
1372 return std::nullopt;
1373}
1374
// Chooses a move opcode from the destination register class DstRC:
//   AGPR class          -> COPY
//   16-bit              -> COPY for SGPR, V_MOV_B16_t16_e64 otherwise
//   32-bit              -> S_MOV_B32 for SGPR, V_MOV_B32_e32 otherwise
//   64-bit              -> S_MOV_B64 for SGPR, V_MOV_B64_PSEUDO otherwise
//   anything else       -> COPY
// NOTE(review): the declarator line (embedded number 1375) is missing from
// this listing.
1376
1377 if (RI.isAGPRClass(DstRC))
1378 return AMDGPU::COPY;
1379 if (RI.getRegSizeInBits(*DstRC) == 16) {
1380 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1381 // before RA.
1382 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1383 }
1384 if (RI.getRegSizeInBits(*DstRC) == 32)
1385 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1386 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1387 return AMDGPU::S_MOV_B64;
1388 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1389 return AMDGPU::V_MOV_B64_PSEUDO;
1390 return AMDGPU::COPY;
1391}
1392
// Returns the descriptor of the V_INDIRECT_REG_{READ,WRITE}_GPR_IDX_B32_Vn
// pseudo for a vector of VecSize bits. IsIndirectSrc selects the READ family,
// otherwise the WRITE family is used. Within a family the first size bucket
// that VecSize fits into is chosen; sizes above 1024 are unreachable.
// NOTE(review): the declarator line (embedded number 1394) is missing from
// this listing.
1393const MCInstrDesc &
1395 bool IsIndirectSrc) const {
1396 if (IsIndirectSrc) {
1397 if (VecSize <= 32) // 4 bytes
1398 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1399 if (VecSize <= 64) // 8 bytes
1400 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1401 if (VecSize <= 96) // 12 bytes
1402 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1403 if (VecSize <= 128) // 16 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1405 if (VecSize <= 160) // 20 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1407 if (VecSize <= 192) // 24 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
1409 if (VecSize <= 224) // 28 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
1411 if (VecSize <= 256) // 32 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1413 if (VecSize <= 288) // 36 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1415 if (VecSize <= 320) // 40 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1417 if (VecSize <= 352) // 44 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1419 if (VecSize <= 384) // 48 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1421 if (VecSize <= 512) // 64 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1423 if (VecSize <= 1024) // 128 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1425
1426 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1427 }
1428
// Write family: same size buckets as the read family above.
1429 if (VecSize <= 32) // 4 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1431 if (VecSize <= 64) // 8 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1433 if (VecSize <= 96) // 12 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1435 if (VecSize <= 128) // 16 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1437 if (VecSize <= 160) // 20 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1439 if (VecSize <= 192) // 24 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
1441 if (VecSize <= 224) // 28 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
1443 if (VecSize <= 256) // 32 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1445 if (VecSize <= 288) // 36 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1447 if (VecSize <= 320) // 40 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1449 if (VecSize <= 352) // 44 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1451 if (VecSize <= 384) // 48 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1453 if (VecSize <= 512) // 64 bytes
1454 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1455 if (VecSize <= 1024) // 128 bytes
1456 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1457
1458 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1459}
1460
1461static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1462 if (VecSize <= 32) // 4 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1464 if (VecSize <= 64) // 8 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1466 if (VecSize <= 96) // 12 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1468 if (VecSize <= 128) // 16 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1470 if (VecSize <= 160) // 20 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1472 if (VecSize <= 192) // 24 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1474 if (VecSize <= 224) // 28 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1476 if (VecSize <= 256) // 32 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1478 if (VecSize <= 288) // 36 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1480 if (VecSize <= 320) // 40 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1482 if (VecSize <= 352) // 44 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1484 if (VecSize <= 384) // 48 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1486 if (VecSize <= 512) // 64 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1488 if (VecSize <= 1024) // 128 bytes
1489 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1490
1491 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1492}
1493
1494static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1495 if (VecSize <= 32) // 4 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1497 if (VecSize <= 64) // 8 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1499 if (VecSize <= 96) // 12 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1501 if (VecSize <= 128) // 16 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1503 if (VecSize <= 160) // 20 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1505 if (VecSize <= 192) // 24 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1507 if (VecSize <= 224) // 28 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1509 if (VecSize <= 256) // 32 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1511 if (VecSize <= 288) // 36 bytes
1512 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1513 if (VecSize <= 320) // 40 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1515 if (VecSize <= 352) // 44 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1517 if (VecSize <= 384) // 48 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1519 if (VecSize <= 512) // 64 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1521 if (VecSize <= 1024) // 128 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1523
1524 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1525}
1526
1527static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1528 if (VecSize <= 64) // 8 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1530 if (VecSize <= 128) // 16 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1532 if (VecSize <= 256) // 32 bytes
1533 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1534 if (VecSize <= 512) // 64 bytes
1535 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1536 if (VecSize <= 1024) // 128 bytes
1537 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1538
1539 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1540}
1541
// Returns the descriptor of the indirect-register-write MOVREL pseudo for a
// vector of VecSize bits and elements of EltSize bits. For SGPRs, EltSize must
// be 32 or 64 and selects the matching helper; any other value is unreachable.
// For the non-SGPR path EltSize must be 32.
// NOTE(review): the final return statement of the non-SGPR path (embedded
// number 1557) is missing from this listing.
1542const MCInstrDesc &
1543SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1544 bool IsSGPR) const {
1545 if (IsSGPR) {
1546 switch (EltSize) {
1547 case 32:
1548 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1549 case 64:
1550 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1551 default:
1552 llvm_unreachable("invalid reg indexing elt size");
1553 }
1554 }
1555
1556 assert(EltSize == 32 && "invalid reg indexing elt size");
1558}
1559
1560static unsigned getSGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
1561 switch (Size) {
1562 case 4:
1563 return NeedsCFI ? AMDGPU::SI_SPILL_S32_CFI_SAVE : AMDGPU::SI_SPILL_S32_SAVE;
1564 case 8:
1565 return NeedsCFI ? AMDGPU::SI_SPILL_S64_CFI_SAVE : AMDGPU::SI_SPILL_S64_SAVE;
1566 case 12:
1567 return NeedsCFI ? AMDGPU::SI_SPILL_S96_CFI_SAVE : AMDGPU::SI_SPILL_S96_SAVE;
1568 case 16:
1569 return NeedsCFI ? AMDGPU::SI_SPILL_S128_CFI_SAVE
1570 : AMDGPU::SI_SPILL_S128_SAVE;
1571 case 20:
1572 return NeedsCFI ? AMDGPU::SI_SPILL_S160_CFI_SAVE
1573 : AMDGPU::SI_SPILL_S160_SAVE;
1574 case 24:
1575 return NeedsCFI ? AMDGPU::SI_SPILL_S192_CFI_SAVE
1576 : AMDGPU::SI_SPILL_S192_SAVE;
1577 case 28:
1578 return NeedsCFI ? AMDGPU::SI_SPILL_S224_CFI_SAVE
1579 : AMDGPU::SI_SPILL_S224_SAVE;
1580 case 32:
1581 return AMDGPU::SI_SPILL_S256_SAVE;
1582 case 36:
1583 return AMDGPU::SI_SPILL_S288_SAVE;
1584 case 40:
1585 return AMDGPU::SI_SPILL_S320_SAVE;
1586 case 44:
1587 return AMDGPU::SI_SPILL_S352_SAVE;
1588 case 48:
1589 return AMDGPU::SI_SPILL_S384_SAVE;
1590 case 64:
1591 return NeedsCFI ? AMDGPU::SI_SPILL_S512_CFI_SAVE
1592 : AMDGPU::SI_SPILL_S512_SAVE;
1593 case 128:
1594 return NeedsCFI ? AMDGPU::SI_SPILL_S1024_CFI_SAVE
1595 : AMDGPU::SI_SPILL_S1024_SAVE;
1596 default:
1597 llvm_unreachable("unknown register size");
1598 }
1599}
1600
1601static unsigned getVGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
1602 switch (Size) {
1603 case 2:
1604 return AMDGPU::SI_SPILL_V16_SAVE;
1605 case 4:
1606 return NeedsCFI ? AMDGPU::SI_SPILL_V32_CFI_SAVE : AMDGPU::SI_SPILL_V32_SAVE;
1607 case 8:
1608 return NeedsCFI ? AMDGPU::SI_SPILL_V64_CFI_SAVE : AMDGPU::SI_SPILL_V64_SAVE;
1609 case 12:
1610 return NeedsCFI ? AMDGPU::SI_SPILL_V96_CFI_SAVE : AMDGPU::SI_SPILL_V96_SAVE;
1611 case 16:
1612 return NeedsCFI ? AMDGPU::SI_SPILL_V128_CFI_SAVE
1613 : AMDGPU::SI_SPILL_V128_SAVE;
1614 case 20:
1615 return NeedsCFI ? AMDGPU::SI_SPILL_V160_CFI_SAVE
1616 : AMDGPU::SI_SPILL_V160_SAVE;
1617 case 24:
1618 return NeedsCFI ? AMDGPU::SI_SPILL_V192_CFI_SAVE
1619 : AMDGPU::SI_SPILL_V192_SAVE;
1620 case 28:
1621 return NeedsCFI ? AMDGPU::SI_SPILL_V224_CFI_SAVE
1622 : AMDGPU::SI_SPILL_V224_SAVE;
1623 case 32:
1624 return NeedsCFI ? AMDGPU::SI_SPILL_V256_CFI_SAVE
1625 : AMDGPU::SI_SPILL_V256_SAVE;
1626 case 36:
1627 return NeedsCFI ? AMDGPU::SI_SPILL_V288_CFI_SAVE
1628 : AMDGPU::SI_SPILL_V288_SAVE;
1629 case 40:
1630 return NeedsCFI ? AMDGPU::SI_SPILL_V320_CFI_SAVE
1631 : AMDGPU::SI_SPILL_V320_SAVE;
1632 case 44:
1633 return NeedsCFI ? AMDGPU::SI_SPILL_V352_CFI_SAVE
1634 : AMDGPU::SI_SPILL_V352_SAVE;
1635 case 48:
1636 return NeedsCFI ? AMDGPU::SI_SPILL_V384_CFI_SAVE
1637 : AMDGPU::SI_SPILL_V384_SAVE;
1638 case 64:
1639 return NeedsCFI ? AMDGPU::SI_SPILL_V512_CFI_SAVE
1640 : AMDGPU::SI_SPILL_V512_SAVE;
1641 case 128:
1642 return NeedsCFI ? AMDGPU::SI_SPILL_V1024_CFI_SAVE
1643 : AMDGPU::SI_SPILL_V1024_SAVE;
1644 default:
1645 llvm_unreachable("unknown register size");
1646 }
1647}
1648
1649static unsigned getAVSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
1650 switch (Size) {
1651 case 4:
1652 return NeedsCFI ? AMDGPU::SI_SPILL_AV32_CFI_SAVE
1653 : AMDGPU::SI_SPILL_AV32_SAVE;
1654 case 8:
1655 return NeedsCFI ? AMDGPU::SI_SPILL_AV64_CFI_SAVE
1656 : AMDGPU::SI_SPILL_AV64_SAVE;
1657 case 12:
1658 return NeedsCFI ? AMDGPU::SI_SPILL_AV96_CFI_SAVE
1659 : AMDGPU::SI_SPILL_AV96_SAVE;
1660 case 16:
1661 return NeedsCFI ? AMDGPU::SI_SPILL_AV128_CFI_SAVE
1662 : AMDGPU::SI_SPILL_AV128_SAVE;
1663 case 20:
1664 return NeedsCFI ? AMDGPU::SI_SPILL_AV160_CFI_SAVE
1665 : AMDGPU::SI_SPILL_AV160_SAVE;
1666 case 24:
1667 return NeedsCFI ? AMDGPU::SI_SPILL_AV192_CFI_SAVE
1668 : AMDGPU::SI_SPILL_AV192_SAVE;
1669 case 28:
1670 return NeedsCFI ? AMDGPU::SI_SPILL_AV224_CFI_SAVE
1671 : AMDGPU::SI_SPILL_AV224_SAVE;
1672 case 32:
1673 return NeedsCFI ? AMDGPU::SI_SPILL_AV256_CFI_SAVE
1674 : AMDGPU::SI_SPILL_AV256_SAVE;
1675 case 36:
1676 return AMDGPU::SI_SPILL_AV288_SAVE;
1677 case 40:
1678 return AMDGPU::SI_SPILL_AV320_SAVE;
1679 case 44:
1680 return AMDGPU::SI_SPILL_AV352_SAVE;
1681 case 48:
1682 return AMDGPU::SI_SPILL_AV384_SAVE;
1683 case 64:
1684 return NeedsCFI ? AMDGPU::SI_SPILL_AV512_CFI_SAVE
1685 : AMDGPU::SI_SPILL_AV512_SAVE;
1686 case 128:
1687 return NeedsCFI ? AMDGPU::SI_SPILL_AV1024_CFI_SAVE
1688 : AMDGPU::SI_SPILL_AV1024_SAVE;
1689 default:
1690 llvm_unreachable("unknown register size");
1691 }
1692}
1693
1694static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1695 bool IsVectorSuperClass) {
1696 // Currently, there is only 32-bit WWM register spills needed.
1697 if (Size != 4)
1698 llvm_unreachable("unknown wwm register spill size");
1699
1700 if (IsVectorSuperClass)
1701 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1702
1703 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1704}
1705
// Chooses the spill-save pseudo for a vector register of class RC and spill
// size Size: the WWM opcode on the WWM path, otherwise the AV opcode when the
// subtarget hasMAIInsts(), otherwise the VGPR opcode. NeedsCFI is forwarded to
// the AV / VGPR helpers only.
// NOTE(review): the declarator line and the condition guarding the WWM return
// (embedded numbers 1706 and 1712) are missing from this listing.
1707 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1708 const SIMachineFunctionInfo &MFI, bool NeedsCFI) const {
1709 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1710
1711 // Choose the right opcode if spilling a WWM register.
1713 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1714
1715 // TODO: Check if AGPRs are available
1716 if (ST.hasMAIInsts())
1717 return getAVSpillSaveOpcode(Size, NeedsCFI);
1718
1719 return getVGPRSpillSaveOpcode(Size, NeedsCFI);
1720}
1721
// Emits a single spill-save pseudo at MI in MBB that stores SrcReg to the
// stack slot FrameIndex.
//   * SGPR classes use the SI_SPILL_S*_SAVE pseudo chosen by
//     getSGPRSpillSaveOpcode(); a 4-byte virtual source is first constrained
//     to SReg_32_XM0_XEXEC.
//   * All other classes use getVectorRegSpillSaveOpcode() and additionally
//     pass the stack-pointer offset register and a zero offset.
// NeedsCFI is forwarded to the opcode selection helpers.
// NOTE(review): several lines (embedded numbers 1723, 1727, 1733, 1761) are
// missing from this listing, including part of the parameter list, the
// declaration of MFI, the start of the MMO construction, and the end of the
// SGPR BuildMI statement.
1722void SIInstrInfo::storeRegToStackSlotImpl(
1724 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1725 MachineInstr::MIFlag Flags, bool NeedsCFI) const {
1726 MachineFunction *MF = MBB.getParent();
1728 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1729 const DebugLoc &DL = MBB.findDebugLoc(MI);
1730
1731 MachinePointerInfo PtrInfo
1732 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1734 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1735 FrameInfo.getObjectAlign(FrameIndex));
1736 unsigned SpillSize = RI.getSpillSize(*RC);
1737
1738 MachineRegisterInfo &MRI = MF->getRegInfo();
1739 if (RI.isSGPRClass(RC)) {
// Record SGPR spilling only when the slot uses the SGPRSpill stack ID.
1740 if (FrameInfo.getStackID(FrameIndex) == TargetStackID::SGPRSpill)
1741 MFI->setHasSpilledSGPRs();
1742 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1743 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1744 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1745
1746 // We are only allowed to create one new instruction when spilling
1747 // registers, so we need to use pseudo instruction for spilling SGPRs.
1748 const MCInstrDesc &OpDesc =
1749 get(getSGPRSpillSaveOpcode(SpillSize, NeedsCFI));
1750
1751 // The SGPR spill/restore instructions only work on number sgprs, so we need
1752 // to make sure we are using the correct register class.
1753 if (SrcReg.isVirtual() && SpillSize == 4) {
1754 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1755 }
1756
1757 BuildMI(MBB, MI, DL, OpDesc)
1758 .addReg(SrcReg, getKillRegState(isKill)) // data
1759 .addFrameIndex(FrameIndex) // addr
1760 .addMemOperand(MMO)
1762
1763 return;
1764 }
1765
// Vector path: VReg, when set, is passed to the opcode selection in place of
// SrcReg.
1766 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1767 SpillSize, *MFI, NeedsCFI);
1768 MFI->setHasSpilledVGPRs();
1769
1770 BuildMI(MBB, MI, DL, get(Opcode))
1771 .addReg(SrcReg, getKillRegState(isKill)) // data
1772 .addFrameIndex(FrameIndex) // addr
1773 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1774 .addImm(0) // offset
1775 .addMemOperand(MMO);
1776}
1777
// Forwards to storeRegToStackSlotImpl() with the trailing NeedsCFI argument
// set to false.
// NOTE(review): the start of the signature (embedded numbers 1778-1779) is
// missing from this listing.
1780 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1781 MachineInstr::MIFlag Flags) const {
1782 storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, VReg, Flags,
1783 false);
1784}
1785
// Forwards to storeRegToStackSlotImpl() with an empty VReg, no MI flags, and
// the trailing NeedsCFI argument set to true.
// NOTE(review): the start of the signature (embedded numbers 1786-1787) is
// missing from this listing.
1788 Register SrcReg, bool isKill,
1789 int FrameIndex,
1790 const TargetRegisterClass *RC) const {
1791 storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, Register(),
1792 MachineInstr::NoFlags, true);
1793}
1794
1795static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1796 switch (Size) {
1797 case 4:
1798 return AMDGPU::SI_SPILL_S32_RESTORE;
1799 case 8:
1800 return AMDGPU::SI_SPILL_S64_RESTORE;
1801 case 12:
1802 return AMDGPU::SI_SPILL_S96_RESTORE;
1803 case 16:
1804 return AMDGPU::SI_SPILL_S128_RESTORE;
1805 case 20:
1806 return AMDGPU::SI_SPILL_S160_RESTORE;
1807 case 24:
1808 return AMDGPU::SI_SPILL_S192_RESTORE;
1809 case 28:
1810 return AMDGPU::SI_SPILL_S224_RESTORE;
1811 case 32:
1812 return AMDGPU::SI_SPILL_S256_RESTORE;
1813 case 36:
1814 return AMDGPU::SI_SPILL_S288_RESTORE;
1815 case 40:
1816 return AMDGPU::SI_SPILL_S320_RESTORE;
1817 case 44:
1818 return AMDGPU::SI_SPILL_S352_RESTORE;
1819 case 48:
1820 return AMDGPU::SI_SPILL_S384_RESTORE;
1821 case 64:
1822 return AMDGPU::SI_SPILL_S512_RESTORE;
1823 case 128:
1824 return AMDGPU::SI_SPILL_S1024_RESTORE;
1825 default:
1826 llvm_unreachable("unknown register size");
1827 }
1828}
1829
1830static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1831 switch (Size) {
1832 case 2:
1833 return AMDGPU::SI_SPILL_V16_RESTORE;
1834 case 4:
1835 return AMDGPU::SI_SPILL_V32_RESTORE;
1836 case 8:
1837 return AMDGPU::SI_SPILL_V64_RESTORE;
1838 case 12:
1839 return AMDGPU::SI_SPILL_V96_RESTORE;
1840 case 16:
1841 return AMDGPU::SI_SPILL_V128_RESTORE;
1842 case 20:
1843 return AMDGPU::SI_SPILL_V160_RESTORE;
1844 case 24:
1845 return AMDGPU::SI_SPILL_V192_RESTORE;
1846 case 28:
1847 return AMDGPU::SI_SPILL_V224_RESTORE;
1848 case 32:
1849 return AMDGPU::SI_SPILL_V256_RESTORE;
1850 case 36:
1851 return AMDGPU::SI_SPILL_V288_RESTORE;
1852 case 40:
1853 return AMDGPU::SI_SPILL_V320_RESTORE;
1854 case 44:
1855 return AMDGPU::SI_SPILL_V352_RESTORE;
1856 case 48:
1857 return AMDGPU::SI_SPILL_V384_RESTORE;
1858 case 64:
1859 return AMDGPU::SI_SPILL_V512_RESTORE;
1860 case 128:
1861 return AMDGPU::SI_SPILL_V1024_RESTORE;
1862 default:
1863 llvm_unreachable("unknown register size");
1864 }
1865}
1866
1867static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1868 switch (Size) {
1869 case 4:
1870 return AMDGPU::SI_SPILL_AV32_RESTORE;
1871 case 8:
1872 return AMDGPU::SI_SPILL_AV64_RESTORE;
1873 case 12:
1874 return AMDGPU::SI_SPILL_AV96_RESTORE;
1875 case 16:
1876 return AMDGPU::SI_SPILL_AV128_RESTORE;
1877 case 20:
1878 return AMDGPU::SI_SPILL_AV160_RESTORE;
1879 case 24:
1880 return AMDGPU::SI_SPILL_AV192_RESTORE;
1881 case 28:
1882 return AMDGPU::SI_SPILL_AV224_RESTORE;
1883 case 32:
1884 return AMDGPU::SI_SPILL_AV256_RESTORE;
1885 case 36:
1886 return AMDGPU::SI_SPILL_AV288_RESTORE;
1887 case 40:
1888 return AMDGPU::SI_SPILL_AV320_RESTORE;
1889 case 44:
1890 return AMDGPU::SI_SPILL_AV352_RESTORE;
1891 case 48:
1892 return AMDGPU::SI_SPILL_AV384_RESTORE;
1893 case 64:
1894 return AMDGPU::SI_SPILL_AV512_RESTORE;
1895 case 128:
1896 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1897 default:
1898 llvm_unreachable("unknown register size");
1899 }
1900}
1901
1902static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1903 bool IsVectorSuperClass) {
1904 // Currently, there is only 32-bit WWM register spills needed.
1905 if (Size != 4)
1906 llvm_unreachable("unknown wwm register spill size");
1907
1908 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1909 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1910
1911 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1912}
1913
// Chooses the spill-restore pseudo for a vector register of class RC and
// spill size Size. The visible lines show a WWM path that returns
// getWWMRegSpillRestoreOpcode(), a branch taken when the subtarget
// hasMAIInsts(), and a fallback that asserts RC is not an AGPR class.
// NOTE(review): the declarator, the WWM condition, and both remaining return
// statements (embedded numbers 1914, 1920, 1925, 1928) are missing from this
// listing, so the opcodes returned on the last two paths cannot be confirmed
// here.
1915 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1916 const SIMachineFunctionInfo &MFI) const {
1917 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1918
1919 // Choose the right opcode if restoring a WWM register.
1921 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1922
1923 // TODO: Check if AGPRs are available
1924 if (ST.hasMAIInsts())
1926
1927 assert(!RI.isAGPRClass(RC));
1929}
1930
/// Emits, before MI in MBB, the spill-restore pseudo that reloads DestReg from
/// stack slot FrameIndex. SGPR classes use the opcode returned by
/// getSGPRSpillRestoreOpcode; every other class uses the opcode returned by
/// getVectorRegSpillRestoreOpcode.
/// NOTE(review): some lines of this function are absent from this listing
/// (the start of the signature and the declarations of MFI and MMO, among
/// others); the comments below cover only the statements that are visible.
1933 Register DestReg, int FrameIndex,
1934 const TargetRegisterClass *RC,
1935 Register VReg, unsigned SubReg,
1936 MachineInstr::MIFlag Flags) const {
1937 MachineFunction *MF = MBB.getParent();
1939 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1940 const DebugLoc &DL = MBB.findDebugLoc(MI);
     // The spill size of RC selects the restore opcode in both paths below.
1941 unsigned SpillSize = RI.getSpillSize(*RC);
1942
1943 MachinePointerInfo PtrInfo
1944 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1945
     // Load memory operand sized and aligned like the frame object itself.
1947 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1948 FrameInfo.getObjectAlign(FrameIndex));
1949
1950 if (RI.isSGPRClass(RC)) {
     // Record that SGPR spills exist when the slot uses the SGPRSpill stack ID.
1951 if (FrameInfo.getStackID(FrameIndex) == TargetStackID::SGPRSpill)
1952 MFI->setHasSpilledSGPRs();
1953 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1954 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1955 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1956
1957 // FIXME: Maybe this should not include a memoperand because it will be
1958 // lowered to non-memory instructions.
1959 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
     // A virtual 4-byte destination is constrained to SReg_32_XM0_XEXEC.
1960 if (DestReg.isVirtual() && SpillSize == 4) {
1961 MachineRegisterInfo &MRI = MF->getRegInfo();
1962 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1963 }
1964
1965 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1966 .addFrameIndex(FrameIndex) // addr
1967 .addMemOperand(MMO)
1969
1970 return;
1971 }
1972
     // Vector path: the opcode is chosen for VReg when one is given, otherwise
     // for DestReg.
1973 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1974 SpillSize, *MFI);
1975 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1976 .addFrameIndex(FrameIndex) // vaddr
1977 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1978 .addImm(0) // offset
1979 .addMemOperand(MMO);
1980}
1981
1986
/// Inserts S_NOP instructions before MI until Quantity no-ops are covered.
/// A single S_NOP covers at most 1 << ST.getSNopBits() no-ops and its
/// immediate encodes (count - 1), so larger requests are split across several
/// instructions.
/// NOTE(review): the first line(s) of the signature are absent from this
/// listing.
1989 unsigned Quantity) const {
1990 DebugLoc DL = MBB.findDebugLoc(MI);
1991 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1992 while (Quantity > 0) {
     // Take as many no-ops as one S_NOP can encode, then loop for the rest.
1993 unsigned Arg = std::min(Quantity, MaxSNopCount);
1994 Quantity -= Arg;
1995 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1996 }
1997}
1998
/// Appends a terminating instruction to MBB when it has no successors and no
/// terminator: S_ENDPGM 0 if the function returns void, SI_RETURN_TO_EPILOG
/// otherwise. The function is asserted to be an entry function.
/// NOTE(review): the signature line and the declaration of Info are absent
/// from this listing.
2000 auto *MF = MBB.getParent();
2002
2003 assert(Info->isEntryFunction());
2004
2005 if (MBB.succ_empty()) {
2006 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2007 if (HasNoTerminator) {
2008 if (Info->returnsVoid()) {
2009 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2010 } else {
2011 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2012 }
2013 }
2014 }
2015}
2016
/// Builds a trap sequence followed by a halt loop and returns
/// MBB.getNextNode(). If MI is not the last instruction of MBB, or MBB has
/// successors, MBB is split at MI and the trap code goes into a new block
/// reached through S_CBRANCH_EXECNZ; otherwise it is appended to MBB itself.
/// NOTE(review): the start of the signature and the operand lines of the two
/// send-message instructions are absent from this listing.
2020 const DebugLoc &DL) const {
2021 MachineFunction *MF = MBB.getParent();
     // Masks applied to the value returned by S_SENDMSG_RTN_B32 below.
2022 constexpr unsigned DoorbellIDMask = 0x3ff;
2023 constexpr unsigned ECQueueWaveAbort = 0x400;
2024
2025 MachineBasicBlock *TrapBB = &MBB;
2026 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2027
2028 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2029 MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2030 TrapBB = MF->CreateMachineBasicBlock();
2031 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2032 MF->push_back(TrapBB);
2033 MBB.addSuccessor(TrapBB);
2034 }
2035 // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
2036 // will be a nop.
2037 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2038 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2039 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2040 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2041 DoorbellReg)
     // M0 is saved in TTMP2 here and restored after the S_SENDMSG below.
2043 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2044 .addUse(AMDGPU::M0);
     // M0 = (DoorbellReg & DoorbellIDMask) | ECQueueWaveAbort.
2045 Register DoorbellRegMasked =
2046 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2047 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2048 .addUse(DoorbellReg)
2049 .addImm(DoorbellIDMask);
2050 Register SetWaveAbortBit =
2051 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2052 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2053 .addUse(DoorbellRegMasked)
2054 .addImm(ECQueueWaveAbort);
2055 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2056 .addUse(SetWaveAbortBit);
2057 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2059 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2060 .addUse(AMDGPU::TTMP2);
2061 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2062 TrapBB->addSuccessor(HaltLoopBB);
2063
     // The halt block branches back to itself, so it is its own successor.
2064 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2065 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2066 .addMBB(HaltLoopBB);
2067 MF->push_back(HaltLoopBB);
2068 HaltLoopBB->addSuccessor(HaltLoopBB);
2069
2070 return MBB.getNextNode();
2071}
2072
/// Returns the number of wait states attributed to MI: 0 for meta
/// instructions, (immediate + 1) for S_NOP, and 1 for everything else.
/// NOTE(review): the signature line is absent from this listing.
2074 switch (MI.getOpcode()) {
2075 default:
2076 if (MI.isMetaInstruction())
2077 return 0;
2078 return 1; // FIXME: Do wait states equal cycles?
2079
2080 case AMDGPU::S_NOP:
     // The S_NOP immediate encodes (count - 1).
2081 return MI.getOperand(0).getImm() + 1;
2082 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2083 // hazard, even if one exists, won't really be visible. Should we handle it?
2084 }
2085}
2086
/// Lowers the pseudo instruction MI. Each case either swaps MI's descriptor in
/// place (setDesc) or builds replacement instructions before MI and erases MI.
/// Opcodes without a case are forwarded to TargetInstrInfo::expandPostRAPseudo;
/// every handled opcode falls out of the switch and returns true.
/// NOTE(review): this listing has lost a number of lines of the function (the
/// signature, the declaration of LMC, several `MachineInstrBuilder MIB =`
/// heads and a few operand lines), so the comments describe only what is
/// visible.
2088 MachineBasicBlock &MBB = *MI.getParent();
2089 DebugLoc DL = MBB.findDebugLoc(MI);
2091 switch (MI.getOpcode()) {
2092 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2093 case AMDGPU::S_MOV_B64_term:
2094 // This is only a terminator to get the correct spill code placement during
2095 // register allocation.
2096 MI.setDesc(get(AMDGPU::S_MOV_B64));
2097 break;
2098
2099 case AMDGPU::S_MOV_B32_term:
2100 // This is only a terminator to get the correct spill code placement during
2101 // register allocation.
2102 MI.setDesc(get(AMDGPU::S_MOV_B32));
2103 break;
2104
2105 case AMDGPU::S_XOR_B64_term:
2106 // This is only a terminator to get the correct spill code placement during
2107 // register allocation.
2108 MI.setDesc(get(AMDGPU::S_XOR_B64));
2109 break;
2110
2111 case AMDGPU::S_XOR_B32_term:
2112 // This is only a terminator to get the correct spill code placement during
2113 // register allocation.
2114 MI.setDesc(get(AMDGPU::S_XOR_B32));
2115 break;
2116 case AMDGPU::S_OR_B64_term:
2117 // This is only a terminator to get the correct spill code placement during
2118 // register allocation.
2119 MI.setDesc(get(AMDGPU::S_OR_B64));
2120 break;
2121 case AMDGPU::S_OR_B32_term:
2122 // This is only a terminator to get the correct spill code placement during
2123 // register allocation.
2124 MI.setDesc(get(AMDGPU::S_OR_B32));
2125 break;
2126
2127 case AMDGPU::S_ANDN2_B64_term:
2128 // This is only a terminator to get the correct spill code placement during
2129 // register allocation.
2130 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2131 break;
2132
2133 case AMDGPU::S_ANDN2_B32_term:
2134 // This is only a terminator to get the correct spill code placement during
2135 // register allocation.
2136 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2137 break;
2138
2139 case AMDGPU::S_AND_B64_term:
2140 // This is only a terminator to get the correct spill code placement during
2141 // register allocation.
2142 MI.setDesc(get(AMDGPU::S_AND_B64));
2143 break;
2144
2145 case AMDGPU::S_AND_B32_term:
2146 // This is only a terminator to get the correct spill code placement during
2147 // register allocation.
2148 MI.setDesc(get(AMDGPU::S_AND_B32));
2149 break;
2150
2151 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2152 // This is only a terminator to get the correct spill code placement during
2153 // register allocation.
2154 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2155 break;
2156
2157 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2158 // This is only a terminator to get the correct spill code placement during
2159 // register allocation.
2160 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2161 break;
2162
2163 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2164 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2165 break;
2166
2167 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2168 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2169 break;
2170 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
     // An AGPR destination becomes V_ACCVGPR_WRITE_B32; anything else becomes
     // V_MOV_B32.
2171 Register Dst = MI.getOperand(0).getReg();
2172 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2173 MI.setDesc(
2174 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2175 break;
2176 }
2177 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
     // An AGPR destination is written as two 32-bit halves; any other
     // destination falls through to the V_MOV_B64_PSEUDO expansion.
2178 Register Dst = MI.getOperand(0).getReg();
2179 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2180 int64_t Imm = MI.getOperand(1).getImm();
2181
2182 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2183 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2184 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2185 .addImm(SignExtend64<32>(Imm));
2186 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2187 .addImm(SignExtend64<32>(Imm >> 32));
2188 MI.eraseFromParent();
2189 break;
2190 }
2191
2192 [[fallthrough]];
2193 }
2194 case AMDGPU::V_MOV_B64_PSEUDO: {
2195 Register Dst = MI.getOperand(0).getReg();
2196 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2197 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2198
2199 const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
2200 const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0);
2201
2202 const MachineOperand &SrcOp = MI.getOperand(1);
2203 // FIXME: Will this work for 64-bit floating point immediates?
2204 assert(!SrcOp.isFPImm());
     // Use the 64-bit move when the subtarget has it, Dst is in its register
     // class and the source operand passes the check below.
2205 if (ST.hasVMovB64Inst() && Mov64RC->contains(Dst)) {
2206 MI.setDesc(Mov64Desc);
2207 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2208 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2209 break;
2210 }
2211 if (SrcOp.isImm()) {
2212 APInt Imm(64, SrcOp.getImm());
2213 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2214 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2215 const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
2216 const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0);
2217
     // Identical inline-constant halves can be written by one V_PK_MOV_B32;
     // otherwise each half gets its own V_MOV_B32.
2218 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
2219 PkMovRC->contains(Dst)) {
2220 BuildMI(MBB, MI, DL, PkMovDesc, Dst)
2222 .addImm(Lo.getSExtValue())
2224 .addImm(Lo.getSExtValue())
2225 .addImm(0) // op_sel_lo
2226 .addImm(0) // op_sel_hi
2227 .addImm(0) // neg_lo
2228 .addImm(0) // neg_hi
2229 .addImm(0); // clamp
2230 } else {
2231 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2232 .addImm(Lo.getSExtValue());
2233 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2234 .addImm(Hi.getSExtValue());
2235 }
2236 } else {
2237 assert(SrcOp.isReg());
2238 if (ST.hasPkMovB32() &&
2239 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2240 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2241 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2242 .addReg(SrcOp.getReg())
2244 .addReg(SrcOp.getReg())
2245 .addImm(0) // op_sel_lo
2246 .addImm(0) // op_sel_hi
2247 .addImm(0) // neg_lo
2248 .addImm(0) // neg_hi
2249 .addImm(0); // clamp
2250 } else {
2251 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2252 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0));
2253 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2254 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1));
2255 }
2256 }
2257 MI.eraseFromParent();
2258 break;
2259 }
2260 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2262 break;
2263 }
2264 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2265 const MachineOperand &SrcOp = MI.getOperand(1);
2266 assert(!SrcOp.isFPImm());
2267
2268 if (ST.has64BitLiterals()) {
2269 MI.setDesc(get(AMDGPU::S_MOV_B64));
2270 break;
2271 }
2272
     // Without 64-bit literals, a single S_MOV_B64 is kept only for values
     // that fit in 32 bits or are inline constants; otherwise the value is
     // split into two S_MOV_B32.
2273 APInt Imm(64, SrcOp.getImm());
2274 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2275 MI.setDesc(get(AMDGPU::S_MOV_B64));
2276 break;
2277 }
2278
2279 Register Dst = MI.getOperand(0).getReg();
2280 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2281 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2282
2283 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2284 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2285 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2286 .addImm(Lo.getSExtValue());
2287 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2288 .addImm(Hi.getSExtValue());
2289 MI.eraseFromParent();
2290 break;
2291 }
2292 case AMDGPU::V_SET_INACTIVE_B32: {
2293 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
     // The pseudo's operands are re-ordered as 3, 4, 1, 2, 5.
2294 Register DstReg = MI.getOperand(0).getReg();
2295 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2296 .add(MI.getOperand(3))
2297 .add(MI.getOperand(4))
2298 .add(MI.getOperand(1))
2299 .add(MI.getOperand(2))
2300 .add(MI.getOperand(5));
2301 MI.eraseFromParent();
2302 break;
2303 }
2304 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2305 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2306 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2307 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2308 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2309 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2310 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2311 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2312 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2313 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2314 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2315 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2316 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2317 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2318 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2319 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2320 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2322 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2323 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2324 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2325 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2326 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2327 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2328 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2329 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2330 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2331 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2332 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2333 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2334 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2335 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2336 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2337 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2338
     // The class of the element operand (operand 2) picks the MOVRELD opcode.
2339 unsigned Opc;
2340 if (RI.hasVGPRs(EltRC)) {
2341 Opc = AMDGPU::V_MOVRELD_B32_e32;
2342 } else {
2343 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2344 : AMDGPU::S_MOVRELD_B32;
2345 }
2346
2347 const MCInstrDesc &OpDesc = get(Opc);
2348 Register VecReg = MI.getOperand(0).getReg();
2349 bool IsUndef = MI.getOperand(1).isUndef();
2350 unsigned SubReg = MI.getOperand(3).getImm();
2351 assert(VecReg == MI.getOperand(1).getReg());
2352
2354 BuildMI(MBB, MI, DL, OpDesc)
2355 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2356 .add(MI.getOperand(2))
2358 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2359
     // Tie the implicit def of the vector register to its implicit use; the
     // two operands sit right after OpDesc's own operands and implicit uses.
2360 const int ImpDefIdx =
2361 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2362 const int ImpUseIdx = ImpDefIdx + 1;
2363 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2364 MI.eraseFromParent();
2365 break;
2366 }
2367 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2368 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2369 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2370 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2371 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2372 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2373 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2374 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2375 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2376 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2377 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2378 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2379 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2380 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2381 assert(ST.useVGPRIndexMode());
2382 Register VecReg = MI.getOperand(0).getReg();
2383 bool IsUndef = MI.getOperand(1).isUndef();
2384 MachineOperand &Idx = MI.getOperand(3);
     // NOTE(review): SubReg holds a subregister index (an immediate) but is
     // typed Register; the MOVREL case above uses unsigned for the same value.
2385 Register SubReg = MI.getOperand(4).getImm();
2386
2387 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2388 .add(Idx)
2390 SetOn->getOperand(3).setIsUndef();
2391
2392 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2394 BuildMI(MBB, MI, DL, OpDesc)
2395 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2396 .add(MI.getOperand(2))
2398 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2399
2400 const int ImpDefIdx =
2401 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2402 const int ImpUseIdx = ImpDefIdx + 1;
2403 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2404
2405 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2406
     // Bundle everything from S_SET_GPR_IDX_ON through S_SET_GPR_IDX_OFF.
2407 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2408
2409 MI.eraseFromParent();
2410 break;
2411 }
2412 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2413 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2414 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2415 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2416 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2417 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2418 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2419 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2420 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2421 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2422 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2423 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2424 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2425 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2426 assert(ST.useVGPRIndexMode());
2427 Register Dst = MI.getOperand(0).getReg();
2428 Register VecReg = MI.getOperand(1).getReg();
2429 bool IsUndef = MI.getOperand(1).isUndef();
     // NOTE(review): SubReg holds a subregister index (an immediate) but is
     // typed Register; the MOVREL case above uses unsigned for the same value.
2430 Register SubReg = MI.getOperand(3).getImm();
2431
2432 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2433 .add(MI.getOperand(2))
2435 SetOn->getOperand(3).setIsUndef();
2436
2437 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2438 .addDef(Dst)
2439 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2440 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2441
2442 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2443
2444 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2445
2446 MI.eraseFromParent();
2447 break;
2448 }
2449 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2450 MachineFunction &MF = *MBB.getParent();
2451 Register Reg = MI.getOperand(0).getReg();
2452 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2453 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
     // Copies of the operands, so the offset adjustments below do not touch MI.
2454 MachineOperand OpLo = MI.getOperand(1);
2455 MachineOperand OpHi = MI.getOperand(2);
2456
2457 // Create a bundle so these instructions won't be re-ordered by the
2458 // post-RA scheduler.
2459 MIBundleBuilder Bundler(MBB, MI);
2460 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2461
2462 // What we want here is an offset from the value returned by s_getpc (which
2463 // is the address of the s_add_u32 instruction) to the global variable, but
2464 // since the encoding of $symbol starts 4 bytes after the start of the
2465 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2466 // small. This requires us to add 4 to the global variable offset in order
2467 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2468 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2469 // instruction.
2470
2471 int64_t Adjust = 0;
2472 if (ST.hasGetPCZeroExtension()) {
2473 // Fix up hardware that does not sign-extend the 48-bit PC value by
2474 // inserting: s_sext_i32_i16 reghi, reghi
2475 Bundler.append(
2476 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
     // The extra instruction adds 4 to both offsets computed below.
2477 Adjust += 4;
2478 }
2479
2480 if (OpLo.isGlobal())
2481 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2482 Bundler.append(
2483 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2484
2485 if (OpHi.isGlobal())
2486 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2487 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2488 .addReg(RegHi)
2489 .add(OpHi));
2490
2491 finalizeBundle(MBB, Bundler.begin());
2492
2493 MI.eraseFromParent();
2494 break;
2495 }
2496 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2497 MachineFunction &MF = *MBB.getParent();
2498 Register Reg = MI.getOperand(0).getReg();
2499 MachineOperand Op = MI.getOperand(1);
2500
2501 // Create a bundle so these instructions won't be re-ordered by the
2502 // post-RA scheduler.
2503 MIBundleBuilder Bundler(MBB, MI);
2504 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2505 if (Op.isGlobal())
2506 Op.setOffset(Op.getOffset() + 4);
2507 Bundler.append(
2508 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2509
2510 finalizeBundle(MBB, Bundler.begin());
2511
2512 MI.eraseFromParent();
2513 break;
2514 }
2515 case AMDGPU::ENTER_STRICT_WWM: {
2516 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2517 // Whole Wave Mode is entered.
2518 MI.setDesc(get(LMC.OrSaveExecOpc));
2519 break;
2520 }
2521 case AMDGPU::ENTER_STRICT_WQM: {
2522 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2523 // STRICT_WQM is entered.
2524 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2525 .addReg(LMC.ExecReg);
2526 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2527
2528 MI.eraseFromParent();
2529 break;
2530 }
2531 case AMDGPU::EXIT_STRICT_WWM:
2532 case AMDGPU::EXIT_STRICT_WQM: {
2533 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2534 // WWM/STRICT_WQM is exited.
2535 MI.setDesc(get(LMC.MovOpc));
2536 break;
2537 }
2538 case AMDGPU::SI_RETURN: {
2539 const MachineFunction *MF = MBB.getParent();
2540 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2541 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2542 // Hiding the return address use with SI_RETURN may lead to extra kills in
2543 // the function and missing live-ins. We are fine in practice because callee
2544 // saved register handling ensures the register value is restored before
2545 // RET, but we need the undef flag here to appease the MachineVerifier
2546 // liveness checks.
2548 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2549 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2550
2551 MIB.copyImplicitOps(MI);
2552 MI.eraseFromParent();
2553 break;
2554 }
2555
2556 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2557 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2558 MI.setDesc(get(AMDGPU::S_MUL_U64));
2559 break;
2560
2561 case AMDGPU::S_GETPC_B64_pseudo:
2562 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2563 if (ST.hasGetPCZeroExtension()) {
2564 Register Dst = MI.getOperand(0).getReg();
2565 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2566 // Fix up hardware that does not sign-extend the 48-bit PC value by
2567 // inserting: s_sext_i32_i16 dsthi, dsthi
2568 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2569 DstHi)
2570 .addReg(DstHi);
2571 }
2572 break;
2573
2574 case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
2575 assert(ST.hasBF16PackedInsts());
2576 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2577 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2578 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2579 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
     // Set OP_SEL_1 in both source-modifier operands of the packed form.
2580 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2581 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2582 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2583 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2584 break;
2585 }
2586
2587 case AMDGPU::GET_STACK_BASE:
2588 // The stack starts at offset 0 unless we need to reserve some space at the
2589 // bottom.
2590 if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) {
2591 // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
2592 // some of the VGPRs. The size of the required scratch space has already
2593 // been computed by prolog epilog insertion.
2594 const SIMachineFunctionInfo *MFI =
2595 MBB.getParent()->getInfo<SIMachineFunctionInfo>();
2596 unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
2597 Register DestReg = MI.getOperand(0).getReg();
2598 BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg)
2601 // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
2602 // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
2603 // SCC, so we need to check for 0 manually.
2604 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg);
2605 // Change the implicit-def of SCC to an explicit use (but first remove
2606 // the dead flag if present).
2607 MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false);
2608 MI.getOperand(MI.getNumExplicitOperands()).setIsUse();
2609 MI.setDesc(get(AMDGPU::S_CMOVK_I32));
2610 MI.addOperand(MachineOperand::CreateImm(VGPRSize));
2611 } else {
2612 MI.setDesc(get(AMDGPU::S_MOV_B32));
2613 MI.addOperand(MachineOperand::CreateImm(0));
2614 MI.removeOperand(
2615 MI.getNumExplicitOperands()); // Drop implicit def of SCC.
2616 }
2617 break;
2618 }
2619
2620 return true;
2621}
2622
/// Re-creates the value defined by Orig into DestReg before I, shrinking the
/// instruction where only part of its result is needed: a 64-bit scalar
/// immediate move becomes an S_MOV_B32 of the used half, and a wide scalar
/// load becomes a narrower load of the used subregister. Every other case is
/// forwarded to TargetInstrInfo::reMaterialize.
/// NOTE(review): the start of the signature and the declaration of NewMMOs
/// are absent from this listing.
2625 unsigned SubIdx, const MachineInstr &Orig,
2626 LaneBitmask UsedLanes) const {
2627
2628 // Try shrinking the instruction to remat only the part needed for current
2629 // context.
2630 // TODO: Handle more cases.
2631 unsigned Opcode = Orig.getOpcode();
2632 switch (Opcode) {
2633 case AMDGPU::S_MOV_B64:
2634 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2635 if (SubIdx != 0)
2636 break;
2637
2638 if (!Orig.getOperand(1).isImm())
2639 break;
2640
2641 // Shrink S_MOV_B64 to S_MOV_B32 when UsedLanes indicates only a single
2642 // 32-bit lane of the 64-bit value is live at the rematerialization point.
2643 if (UsedLanes.all())
2644 break;
2645
2646 // Determine which half of the 64-bit immediate corresponds to the use.
2647 unsigned OrigSubReg = Orig.getOperand(0).getSubReg();
2648 unsigned LoSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub0);
2649 unsigned HiSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub1);
2650
2651 bool NeedLo = (UsedLanes & RI.getSubRegIndexLaneMask(LoSubReg)).any();
2652 bool NeedHi = (UsedLanes & RI.getSubRegIndexLaneMask(HiSubReg)).any();
2653
     // Both halves live: fall back to the generic rematerialization.
2654 if (NeedLo && NeedHi)
2655 break;
2656
     // If neither half is flagged, the high half is the one emitted.
2657 int64_t Imm64 = Orig.getOperand(1).getImm();
2658 int32_t Imm32 = NeedLo ? Lo_32(Imm64) : Hi_32(Imm64);
2659
2660 unsigned UseSubReg = NeedLo ? LoSubReg : HiSubReg;
2661
2662 // Emit S_MOV_B32 defining just the needed 32-bit subreg of DestReg.
2663 BuildMI(MBB, I, Orig.getDebugLoc(), get(AMDGPU::S_MOV_B32))
2664 .addReg(DestReg, RegState::Define | RegState::Undef, UseSubReg)
2665 .addImm(Imm32);
2666 return;
2667 }
2668
2669 case AMDGPU::S_LOAD_DWORDX16_IMM:
2670 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2671 if (SubIdx != 0)
2672 break;
2673
2674 if (I == MBB.end())
2675 break;
2676
2677 if (I->isBundled())
2678 break;
2679
2680 // Look for a single use of the register that is also a subreg.
2681 Register RegToFind = Orig.getOperand(0).getReg();
2682 MachineOperand *UseMO = nullptr;
2683 for (auto &CandMO : I->operands()) {
2684 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2685 continue;
     // A second use disqualifies the instruction.
2686 if (UseMO) {
2687 UseMO = nullptr;
2688 break;
2689 }
2690 UseMO = &CandMO;
2691 }
2692 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2693 break;
2694
     // SubregSize is compared against 256 and 128 below, i.e. it is in bits;
     // Offset is presumably in bits as well (both are divided by 8 later).
2695 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2696 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2697
2698 MachineFunction *MF = MBB.getParent();
2699 MachineRegisterInfo &MRI = MF->getRegInfo();
2700 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2701
     // Only 256-bit and 128-bit subregister uses are shrunk.
2702 unsigned NewOpcode = -1;
2703 if (SubregSize == 256)
2704 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2705 else if (SubregSize == 128)
2706 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2707 else
2708 break;
2709
2710 const MCInstrDesc &TID = get(NewOpcode);
2711 const TargetRegisterClass *NewRC =
2712 RI.getAllocatableClass(getRegClass(TID, 0));
2713 MRI.setRegClass(DestReg, NewRC);
2714
     // The use now reads the whole (narrower) DestReg instead of a subregister.
2715 UseMO->setReg(DestReg);
2716 UseMO->setSubReg(AMDGPU::NoSubRegister);
2717
2718 // Use a smaller load with the desired size, possibly with updated offset.
2719 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2720 MI->setDesc(TID);
2721 MI->getOperand(0).setReg(DestReg);
2722 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2723 if (Offset) {
2724 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2725 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2726 OffsetMO->setImm(FinalOffset);
2727 }
     // Rebuild each memory operand with the narrower access size.
2729 for (const MachineMemOperand *MemOp : Orig.memoperands())
2730 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2731 SubregSize / 8));
2732 MI->setMemRefs(*MF, NewMMOs);
2733
2734 MBB.insert(I, MI);
2735 return;
2736 }
2737
2738 default:
2739 break;
2740 }
2741
2742 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, UsedLanes);
2743}
2744
/// Expands V_MOV_B64_DPP_PSEUDO. When the subtarget checks in the first `if`
/// pass, MI is rewritten in place to V_MOV_B64_dpp and {&MI, nullptr} is
/// returned. Otherwise MI is replaced by two V_MOV_B32_dpp instructions, one
/// per 32-bit half, and both are returned.
/// NOTE(review): the line carrying the function name and one line of the
/// first condition are absent from this listing.
2745std::pair<MachineInstr*, MachineInstr*>
2747 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2748
2749 if (ST.hasVMovB64Inst() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2751 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2752 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2753 return std::pair(&MI, nullptr);
2754 }
2755
2756 MachineBasicBlock &MBB = *MI.getParent();
2757 DebugLoc DL = MBB.findDebugLoc(MI);
2758 MachineFunction *MF = MBB.getParent();
2759 MachineRegisterInfo &MRI = MF->getRegInfo();
2760 Register Dst = MI.getOperand(0).getReg();
2761 unsigned Part = 0;
2762 MachineInstr *Split[2];
2763
2764 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2765 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
     // A physical Dst is written through its subregister; a virtual Dst gets a
     // fresh 32-bit temporary that is joined by the REG_SEQUENCE below.
2766 if (Dst.isPhysical()) {
2767 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2768 } else {
2769 assert(MRI.isSSA());
2770 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2771 MovDPP.addDef(Tmp);
2772 }
2773
2774 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2775 const MachineOperand &SrcOp = MI.getOperand(I);
2776 assert(!SrcOp.isFPImm());
2777 if (SrcOp.isImm()) {
     // Take the 32 bits of the immediate that belong to this half.
2778 APInt Imm(64, SrcOp.getImm());
2779 Imm.ashrInPlace(Part * 32);
2780 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2781 } else {
2782 assert(SrcOp.isReg());
2783 Register Src = SrcOp.getReg();
2784 if (Src.isPhysical())
2785 MovDPP.addReg(RI.getSubReg(Src, Sub));
2786 else
2787 MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub);
2788 }
2789 }
2790
     // The remaining explicit operands (index 3 onwards) are copied unchanged
     // to both halves.
2791 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2792 MovDPP.addImm(MO.getImm());
2793
2794 Split[Part] = MovDPP;
2795 ++Part;
2796 }
2797
2798 if (Dst.isVirtual())
2799 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2800 .addReg(Split[0]->getOperand(0).getReg())
2801 .addImm(AMDGPU::sub0)
2802 .addReg(Split[1]->getOperand(0).getReg())
2803 .addImm(AMDGPU::sub1);
2804
2805 MI.eraseFromParent();
2806 return std::pair(Split[0], Split[1]);
2807}
2808
/// For a WWM_COPY, returns a DestSourcePair built from operand 0 and
/// operand 1 (in that order); for any other opcode returns std::nullopt.
/// NOTE(review): the line carrying the function name is absent from this
/// listing.
2809std::optional<DestSourcePair>
2811 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2812 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2813
2814 return std::nullopt;
2815}
2816
2818 AMDGPU::OpName Src0OpName,
2819 MachineOperand &Src1,
2820 AMDGPU::OpName Src1OpName) const {
// Exchanges the immediate values of the two operands of MI named by
// Src0OpName and Src1OpName. Returns false (leaving MI untouched) when MI has
// no operand named Src0OpName, and true after the swap otherwise.
2821 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2822 if (!Src0Mods)
2823 return false;
2824
// Once the first named operand exists the second one is required to exist
// too; this is only checked by the assert.
2825 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2826 assert(Src1Mods &&
2827 "All commutable instructions have both src0 and src1 modifiers");
2828
2829 int Src0ModsVal = Src0Mods->getImm();
2830 int Src1ModsVal = Src1Mods->getImm();
2831
2832 Src1Mods->setImm(Src0ModsVal);
2833 Src0Mods->setImm(Src1ModsVal);
2834 return true;
2835}
2836
2838 MachineOperand &RegOp,
2839 MachineOperand &NonRegOp) {
// Exchanges a register operand (RegOp) with a non-register operand (NonRegOp)
// in place. Immediate, frame-index and global-address operands are supported;
// for any other kind nullptr is returned before either operand is modified.
// Returns &MI on success.
//
// Snapshot the register operand's state first: it is overwritten below.
2840 Register Reg = RegOp.getReg();
2841 unsigned SubReg = RegOp.getSubReg();
2842 bool IsKill = RegOp.isKill();
2843 bool IsDead = RegOp.isDead();
2844 bool IsUndef = RegOp.isUndef();
2845 bool IsDebug = RegOp.isDebug();
2846
// Turn the former register operand into a copy of the non-register operand.
2847 if (NonRegOp.isImm())
2848 RegOp.ChangeToImmediate(NonRegOp.getImm());
2849 else if (NonRegOp.isFI())
2850 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2851 else if (NonRegOp.isGlobal()) {
2852 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2853 NonRegOp.getTargetFlags());
2854 } else
2855 return nullptr;
2856
2857 // Make sure we don't reinterpret a subreg index in the target flags.
2858 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2859
// Rebuild the register in the other slot as a non-def, non-implicit operand
// carrying the saved kill/dead/undef/debug flags and subregister index.
2860 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2861 NonRegOp.setSubReg(SubReg);
2862
2863 return &MI;
2864}
2865
2867 MachineOperand &NonRegOp1,
2868 MachineOperand &NonRegOp2) {
// Exchanges both the immediate value and the target flags of two immediate
// operands, then returns &MI.
2869 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2870 int64_t NonRegVal = NonRegOp1.getImm();
2871
2872 NonRegOp1.setImm(NonRegOp2.getImm());
2873 NonRegOp2.setImm(NonRegVal);
2874 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2875 NonRegOp2.setTargetFlags(TargetFlags);
2876 return &MI;
2877}
2878
2879bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2880 unsigned OpIdx1) const {
2881 const MCInstrDesc &InstDesc = MI.getDesc();
2882 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2883 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2884
2885 unsigned Opc = MI.getOpcode();
2886 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2887
2888 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2889 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2890
2891 // Swap doesn't breach constant bus or literal limits
2892 // It may move literal to position other than src0, this is not allowed
2893 // pre-gfx10 However, most test cases need literals in Src0 for VOP
2894 // FIXME: After gfx9, literal can be in place other than Src0
2895 if (isVALU(MI)) {
2896 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2897 !isInlineConstant(MO0, OpInfo1))
2898 return false;
2899 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2900 !isInlineConstant(MO1, OpInfo0))
2901 return false;
2902 }
2903
2904 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2905 if (OpInfo1.RegClass == -1)
2906 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2907 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2908 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2909 }
2910 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2911 if (OpInfo0.RegClass == -1)
2912 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2913 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2914 isLegalRegOperand(MI, OpIdx0, MO1);
2915 }
2916
2917 // No need to check 64-bit literals since swapping does not bring new
2918 // 64-bit literals into current instruction to fold to 32-bit
2919
2920 return isImmOperandLegal(MI, OpIdx1, MO0);
2921}
2922
2924 unsigned Src0Idx,
2925 unsigned Src1Idx) const {
// Commutes the src0/src1 operands of MI in place and switches MI to the
// commuted opcode. Returns the commuted instruction, or nullptr when there is
// no commuted opcode, the swap is not legal, or the operand kinds cannot be
// exchanged. Creating a new instruction (NewMI) is not supported.
2926 assert(!NewMI && "this should never be used");
2927
2928 unsigned Opc = MI.getOpcode();
2929 int CommutedOpcode = commuteOpcode(Opc);
2930 if (CommutedOpcode == -1)
2931 return nullptr;
2932
// Normalize so that Src0Idx is the lower operand index.
2933 if (Src0Idx > Src1Idx)
2934 std::swap(Src0Idx, Src1Idx);
2935
2936 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2937 static_cast<int>(Src0Idx) &&
2938 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2939 static_cast<int>(Src1Idx) &&
2940 "inconsistency with findCommutedOpIndices");
2941
2942 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2943 return nullptr;
2944
// Pick the swap routine that matches the kinds of the two operands.
2945 MachineInstr *CommutedMI = nullptr;
2946 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2947 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2948 if (Src0.isReg() && Src1.isReg()) {
2949 // Be sure to copy the source modifiers to the right place.
2950 CommutedMI =
2951 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2952 } else if (Src0.isReg() && !Src1.isReg()) {
2953 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2954 } else if (!Src0.isReg() && Src1.isReg()) {
2955 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2956 } else if (Src0.isImm() && Src1.isImm()) {
2957 CommutedMI = swapImmOperands(MI, Src0, Src1);
2958 } else {
2959 // FIXME: Found two non registers to commute. This does happen.
2960 return nullptr;
2961 }
2962
// On success also exchange the per-source modifier and sel operands (each
// call does nothing when the instruction lacks them) and install the commuted
// opcode.
2963 if (CommutedMI) {
2964 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2965 Src1, AMDGPU::OpName::src1_modifiers);
2966
2967 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2968 AMDGPU::OpName::src1_sel);
2969
2970 CommutedMI->setDesc(get(CommutedOpcode));
2971 }
2972
2973 return CommutedMI;
2974}
2975
2976// This needs to be implemented because the source modifiers may be inserted
2977// between the true commutable operands, and the base
2978// TargetInstrInfo::commuteInstruction uses it.
2980 unsigned &SrcOpIdx0,
2981 unsigned &SrcOpIdx1) const {
// Forwards to the MCInstrDesc-based overload using MI's descriptor.
2982 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2983}
2984
2986 unsigned &SrcOpIdx0,
2987 unsigned &SrcOpIdx1) const {
// Resolves the commutable operand pair from an instruction descriptor. Only
// the operands named src0 and src1 are candidates: returns false when the
// descriptor is not commutable or either named operand is absent, otherwise
// returns the result of fixCommutedOpIndices for that pair.
2988 if (!Desc.isCommutable())
2989 return false;
2990
2991 unsigned Opc = Desc.getOpcode();
2992 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2993 if (Src0Idx == -1)
2994 return false;
2995
2996 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2997 if (Src1Idx == -1)
2998 return false;
2999
3000 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
3001}
3002
3004 int64_t BrOffset) const {
// Returns whether BrOffset (given in bytes) can be encoded by the branch:
// the offset is converted to dwords, rebased to the instruction following the
// branch, and then tested with isIntN against BranchOffsetBits (the
// "amdgpu-s-branch-bits" option).
3005 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
3006 // because its dest block is unanalyzable.
3007 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
3008
3009 // Convert to dwords.
3010 BrOffset /= 4;
3011
3012 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
3013 // from the next instruction.
3014 BrOffset -= 1;
3015
3016 return isIntN(BranchOffsetBits, BrOffset);
3017}
3018
// The destination block is taken from operand 0 of MI.
3021 return MI.getOperand(0).getMBB();
3022}
3023
// Returns true if any terminator of MBB is an SI_IF, SI_ELSE or SI_LOOP
// pseudo instruction, false otherwise.
3025 for (const MachineInstr &MI : MBB->terminators()) {
3026 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
3027 MI.getOpcode() == AMDGPU::SI_LOOP)
3028 return true;
3029 }
3030 return false;
3031}
3032
3034 MachineBasicBlock &DestBB,
3035 MachineBasicBlock &RestoreBB,
3036 const DebugLoc &DL, int64_t BrOffset,
3037 RegScavenger *RS) const {
// Emits a long branch from the (empty, single-predecessor) block MBB to
// DestBB.
//
// When ST.useAddPC64Inst() holds, a single S_ADD_PC_I64 is built and the
// "offset" symbol is defined as DestBB minus the label placed right after it.
// Otherwise the sequence S_GETPC_B64 / S_ADD_U32 / S_ADDC_U32 / S_SETPC_B64 is
// built around a 64-bit SGPR pair. That pair comes from the reserved
// long-branch register if there is one, else from the register scavenger,
// and if neither yields a register SGPR0_SGPR1 is spilled with the restore
// code placed in RestoreBB, which then becomes the branch target.
3038 assert(MBB.empty() &&
3039 "new block should be inserted for expanding unconditional branch");
3040 assert(MBB.pred_size() == 1);
3041 assert(RestoreBB.empty() &&
3042 "restore block should be inserted for restoring clobbered registers");
3043
3044 MachineFunction *MF = MBB.getParent();
3045 MachineRegisterInfo &MRI = MF->getRegInfo();
3047 auto I = MBB.end();
3048 auto &MCCtx = MF->getContext();
3049
3050 if (ST.useAddPC64Inst()) {
3051 MCSymbol *Offset =
3052 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
3053 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
3055 MCSymbol *PostAddPCLabel =
3056 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
3057 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
3058 auto *OffsetExpr = MCBinaryExpr::createSub(
3059 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
3060 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
3061 Offset->setVariableValue(OffsetExpr);
3062 return;
3063 }
3064
3065 assert(RS && "RegScavenger required for long branching");
3066
3067 // FIXME: Virtual register workaround for RegScavenger not working with empty
3068 // blocks.
3069 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3070
3071 // Note: as this is used after hazard recognizer we need to apply some hazard
3072 // workarounds directly.
3073 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
3074 ST.hasVALUReadSGPRHazard();
3075 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
3076 if (FlushSGPRWrites)
3077 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
3079 };
3080
3081 // We need to compute the offset relative to the instruction immediately after
3082 // s_getpc_b64. Insert pc arithmetic code before last terminator.
3083 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
3084 ApplyHazardWorkarounds();
3085
3086 MCSymbol *PostGetPCLabel =
3087 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
3088 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
3089
// The low and high halves of the 64-bit offset are added separately through
// two symbols whose values are assigned at the end of this function.
3090 MCSymbol *OffsetLo =
3091 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
3092 MCSymbol *OffsetHi =
3093 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
3094 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
3095 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
3096 .addReg(PCReg, {}, AMDGPU::sub0)
3097 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
3098 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
3099 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
3100 .addReg(PCReg, {}, AMDGPU::sub1)
3101 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
3102 ApplyHazardWorkarounds();
3103
3104 // Insert the indirect branch after the other terminator.
3105 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
3106 .addReg(PCReg);
3107
3108 // If a spill is needed for the pc register pair, we need to insert a spill
3109 // restore block right before the destination block, and insert a short branch
3110 // into the old destination block's fallthrough predecessor.
3111 // e.g.:
3112 //
3113 // s_cbranch_scc0 skip_long_branch:
3114 //
3115 // long_branch_bb:
3116 // spill s[8:9]
3117 // s_getpc_b64 s[8:9]
3118 // s_add_u32 s8, s8, restore_bb
3119 // s_addc_u32 s9, s9, 0
3120 // s_setpc_b64 s[8:9]
3121 //
3122 // skip_long_branch:
3123 // foo;
3124 //
3125 // .....
3126 //
3127 // dest_bb_fallthrough_predecessor:
3128 // bar;
3129 // s_branch dest_bb
3130 //
3131 // restore_bb:
3132 // restore s[8:9]
3133 // fallthrough dest_bb
3134 //
3135 // dest_bb:
3136 // buzz;
3137
3138 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3139 Register Scav;
3140
3141 // If we've previously reserved a register for long branches
3142 // avoid running the scavenger and just use those registers
3143 if (LongBranchReservedReg) {
3144 RS->enterBasicBlock(MBB);
3145 Scav = LongBranchReservedReg;
3146 } else {
3147 RS->enterBasicBlockEnd(MBB);
3148 Scav = RS->scavengeRegisterBackwards(
3149 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3150 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3151 }
// Replace the temporary virtual PC register with the physical pair chosen
// above, or with SGPR0_SGPR1 after an emergency spill.
3152 if (Scav) {
3153 RS->setRegUsed(Scav);
3154 MRI.replaceRegWith(PCReg, Scav);
3155 MRI.clearVirtRegs();
3156 } else {
3157 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3158 // SGPR spill.
3159 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3160 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3161 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3162 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3163 MRI.clearVirtRegs();
3164 }
3165
// Without a free register the branch must land on RestoreBB instead of DestBB.
3166 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3167 // Now, the distance could be defined.
3169 MCSymbolRefExpr::create(DestLabel, MCCtx),
3170 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3171 // Add offset assignments.
3172 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3173 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3174 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3175 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3176}
3177
3178unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3179 switch (Cond) {
3180 case SIInstrInfo::SCC_TRUE:
3181 return AMDGPU::S_CBRANCH_SCC1;
3182 case SIInstrInfo::SCC_FALSE:
3183 return AMDGPU::S_CBRANCH_SCC0;
3184 case SIInstrInfo::VCCNZ:
3185 return AMDGPU::S_CBRANCH_VCCNZ;
3186 case SIInstrInfo::VCCZ:
3187 return AMDGPU::S_CBRANCH_VCCZ;
3188 case SIInstrInfo::EXECNZ:
3189 return AMDGPU::S_CBRANCH_EXECNZ;
3190 case SIInstrInfo::EXECZ:
3191 return AMDGPU::S_CBRANCH_EXECZ;
3192 default:
3193 llvm_unreachable("invalid branch predicate");
3194 }
3195}
3196
3197SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3198 switch (Opcode) {
3199 case AMDGPU::S_CBRANCH_SCC0:
3200 return SCC_FALSE;
3201 case AMDGPU::S_CBRANCH_SCC1:
3202 return SCC_TRUE;
3203 case AMDGPU::S_CBRANCH_VCCNZ:
3204 return VCCNZ;
3205 case AMDGPU::S_CBRANCH_VCCZ:
3206 return VCCZ;
3207 case AMDGPU::S_CBRANCH_EXECNZ:
3208 return EXECNZ;
3209 case AMDGPU::S_CBRANCH_EXECZ:
3210 return EXECZ;
3211 default:
3212 return INVALID_BR;
3213 }
3214}
3215
3219 MachineBasicBlock *&FBB,
3221 bool AllowModify) const {
// Analyzes the branch sequence starting at iterator I. Returns false when the
// sequence was understood (TBB, and where applicable FBB and Cond, are filled
// in) and true when it could not be analyzed.
//
// Cond is encoded as two operands: the BranchPredicate as an immediate,
// followed by operand 1 of the conditional branch.
3222 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3223 // Unconditional Branch
3224 TBB = I->getOperand(0).getMBB();
3225 return false;
3226 }
3227
3228 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3229 if (Pred == INVALID_BR)
3230 return true;
3231
3232 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3233 Cond.push_back(MachineOperand::CreateImm(Pred));
3234 Cond.push_back(I->getOperand(1)); // Save the branch register.
3235
3236 ++I;
3237
3238 if (I == MBB.end()) {
3239 // Conditional branch followed by fall-through.
3240 TBB = CondBB;
3241 return false;
3242 }
3243
// Conditional branch followed by an unconditional branch.
3244 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3245 TBB = CondBB;
3246 FBB = I->getOperand(0).getMBB();
3247 return false;
3248 }
3249
// Anything else after the conditional branch is not understood.
3250 return true;
3251}
3252
3254 MachineBasicBlock *&FBB,
3256 bool AllowModify) const {
// Analyzes the terminators of MBB. Returns false when the block has no
// terminators, or only the exec-management pseudo terminators skipped below,
// or when analyzeBranchImpl understands the remaining branch sequence.
// Returns true when the block cannot be analyzed.
3257 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3258 auto E = MBB.end();
3259 if (I == E)
3260 return false;
3261
3262 // Skip over the instructions that are artificially terminators for special
3263 // exec management.
3264 while (I != E && !I->isBranch() && !I->isReturn()) {
3265 switch (I->getOpcode()) {
3266 case AMDGPU::S_MOV_B64_term:
3267 case AMDGPU::S_XOR_B64_term:
3268 case AMDGPU::S_OR_B64_term:
3269 case AMDGPU::S_ANDN2_B64_term:
3270 case AMDGPU::S_AND_B64_term:
3271 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3272 case AMDGPU::S_MOV_B32_term:
3273 case AMDGPU::S_XOR_B32_term:
3274 case AMDGPU::S_OR_B32_term:
3275 case AMDGPU::S_ANDN2_B32_term:
3276 case AMDGPU::S_AND_B32_term:
3277 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3278 break;
3279 case AMDGPU::SI_IF:
3280 case AMDGPU::SI_ELSE:
3281 case AMDGPU::SI_KILL_I1_TERMINATOR:
3282 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3283 // FIXME: It's messy that these need to be considered here at all.
3284 return true;
3285 default:
3286 llvm_unreachable("unexpected non-branch terminator inst");
3287 }
3288
3289 ++I;
3290 }
3291
3292 if (I == E)
3293 return false;
3294
3295 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3296}
3297
3299 int *BytesRemoved) const {
// Erases every branch and return terminator of MBB and returns how many
// instructions were erased. When BytesRemoved is non-null it receives the sum
// of getInstSizeInBytes over the erased instructions.
3300 unsigned Count = 0;
3301 unsigned RemovedSize = 0;
// make_early_inc_range lets the current instruction be erased while iterating.
3302 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3303 // Skip over artificial terminators when removing instructions.
3304 if (MI.isBranch() || MI.isReturn()) {
3305 RemovedSize += getInstSizeInBytes(MI);
3306 MI.eraseFromParent();
3307 ++Count;
3308 }
3309 }
3310
3311 if (BytesRemoved)
3312 *BytesRemoved = RemovedSize;
3313
3314 return Count;
3315}
3316
3317// Copy the undef and kill flags of OrigCond onto the implicit condition
// register operand CondReg. No other operand state is transferred.
3319 const MachineOperand &OrigCond) {
3320 CondReg.setIsUndef(OrigCond.isUndef());
3321 CondReg.setIsKill(OrigCond.isKill());
3322}
3323
3326 MachineBasicBlock *FBB,
3328 const DebugLoc &DL,
3329 int *BytesAdded) const {
// Appends branch instructions to MBB and returns how many were inserted:
//  - no FBB and empty Cond: one S_BRANCH to TBB;
//  - no FBB, non-empty Cond: one conditional branch to TBB;
//  - TBB and FBB: a conditional branch to TBB followed by S_BRANCH to FBB.
// Cond[0] holds the BranchPredicate as an immediate; Cond[1] supplies the
// undef/kill flags copied onto the branch's condition register operand.
// When BytesAdded is non-null it receives 4 bytes per inserted branch, or 8
// per branch when ST.hasOffset3fBug() is true.
3330 if (!FBB && Cond.empty()) {
3331 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3332 .addMBB(TBB);
3333 if (BytesAdded)
3334 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3335 return 1;
3336 }
3337
3338 assert(TBB && Cond[0].isImm());
3339
3340 unsigned Opcode
3341 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3342
3343 if (!FBB) {
3344 MachineInstr *CondBr =
3345 BuildMI(&MBB, DL, get(Opcode))
3346 .addMBB(TBB);
3347
3348 // Copy the flags onto the implicit condition register operand.
3349 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3350 fixImplicitOperands(*CondBr);
3351
3352 if (BytesAdded)
3353 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3354 return 1;
3355 }
3356
3357 assert(TBB && FBB);
3358
3359 MachineInstr *CondBr =
3360 BuildMI(&MBB, DL, get(Opcode))
3361 .addMBB(TBB);
3362 fixImplicitOperands(*CondBr);
3363 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3364 .addMBB(FBB);
3365
// Same undef/kill flag transfer that preserveCondRegFlags performs in the
// single-branch case above, written out inline here.
3366 MachineOperand &CondReg = CondBr->getOperand(1);
3367 CondReg.setIsUndef(Cond[1].isUndef());
3368 CondReg.setIsKill(Cond[1].isKill());
3369
3370 if (BytesAdded)
3371 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3372
3373 return 2;
3374}
3375
// Reverses a branch condition in place by negating the predicate immediate in
// Cond[0]. Returns false on success and true when Cond does not have exactly
// two operands or Cond[0] is not an immediate.
// NOTE(review): this relies on opposite BranchPredicate values being
// arithmetic negations of each other; confirm against the enum definition.
3378 if (Cond.size() != 2) {
3379 return true;
3380 }
3381
3382 if (Cond[0].isImm()) {
3383 Cond[0].setImm(-Cond[0].getImm());
3384 return false;
3385 }
3386
3387 return true;
3388}
3389
3392 Register DstReg, Register TrueReg,
3393 Register FalseReg, int &CondCycles,
3394 int &TrueCycles, int &FalseCycles) const {
// Reports whether a select between TrueReg and FalseReg can be emitted for
// the predicate in Cond[0], and sets the three cycle outputs to the number of
// select instructions that would be needed. Both registers must share one
// register class. VCC predicates require a class with VGPRs and at most six
// instructions; SCC predicates require an SGPR class. Any other predicate is
// rejected.
3395 switch (Cond[0].getImm()) {
3396 case VCCNZ:
3397 case VCCZ: {
3398 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3399 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3400 if (MRI.getRegClass(FalseReg) != RC)
3401 return false;
3402
// One 32-bit select per 32 bits of the register class.
3403 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3404 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3405
3406 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3407 return RI.hasVGPRs(RC) && NumInsts <= 6;
3408 }
3409 case SCC_TRUE:
3410 case SCC_FALSE: {
3411 // FIXME: We could insert for VGPRs if we could replace the original compare
3412 // with a vector one.
3413 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3414 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3415 if (MRI.getRegClass(FalseReg) != RC)
3416 return false;
3417
3418 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3419
3420 // Multiples of 8 bytes (an even number of 32-bit parts) can do s_cselect_b64
3421 if (NumInsts % 2 == 0)
3422 NumInsts /= 2;
3423
3424 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3425 return RI.isSGPRClass(RC);
3426 }
3427 default:
3428 return false;
3429 }
3430}
3431
3435 Register TrueReg, Register FalseReg) const {
// Emits the select DstReg = Cond ? TrueReg : FalseReg before I.
//  - 32-bit destinations use one S_CSELECT_B32 (SCC) or V_CNDMASK_B32_e32.
//  - 64-bit destinations under SCC use one S_CSELECT_B64.
//  - Everything else is selected piecewise into fresh virtual registers that
//    are combined into DstReg by a REG_SEQUENCE.
//
// Canonicalize "false" predicates to their "true" form by negating the
// predicate and swapping the two inputs.
3436 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3437 if (Pred == VCCZ || Pred == SCC_FALSE) {
3438 Pred = static_cast<BranchPredicate>(-Pred);
3439 std::swap(TrueReg, FalseReg);
3440 }
3441
3442 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3443 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3444 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3445
3446 if (DstSize == 32) {
3448 if (Pred == SCC_TRUE) {
3449 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3450 .addReg(TrueReg)
3451 .addReg(FalseReg);
3452 } else {
3453 // Instruction's operands are backwards from what is expected.
3454 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3455 .addReg(FalseReg)
3456 .addReg(TrueReg);
3457 }
3458
3459 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3460 return;
3461 }
3462
3463 if (DstSize == 64 && Pred == SCC_TRUE) {
3465 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3466 .addReg(TrueReg)
3467 .addReg(FalseReg);
3468
3469 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3470 return;
3471 }
3472
// Subregister indices used to split a wide value into 32-bit parts.
3473 static const int16_t Sub0_15[] = {
3474 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3475 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3476 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3477 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3478 };
3479
// Subregister indices used to split a wide value into 64-bit parts.
3480 static const int16_t Sub0_15_64[] = {
3481 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3482 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3483 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3484 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3485 };
3486
// Default to per-dword VALU selects; the SCC case below overrides this.
3487 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3488 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3489 const int16_t *SubIndices = Sub0_15;
3490 int NElts = DstSize / 32;
3491
3492 // 64-bit select is only available for SALU.
3493 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3494 if (Pred == SCC_TRUE) {
3495 if (NElts % 2) {
3496 SelOp = AMDGPU::S_CSELECT_B32;
3497 EltRC = &AMDGPU::SGPR_32RegClass;
3498 } else {
3499 SelOp = AMDGPU::S_CSELECT_B64;
3500 EltRC = &AMDGPU::SGPR_64RegClass;
3501 SubIndices = Sub0_15_64;
3502 NElts /= 2;
3503 }
3504 }
3505
3507 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3508
// The per-element selects are inserted before the REG_SEQUENCE built above.
3509 I = MIB->getIterator();
3510
3512 for (int Idx = 0; Idx != NElts; ++Idx) {
3513 Register DstElt = MRI.createVirtualRegister(EltRC);
3514 Regs.push_back(DstElt);
3515
3516 unsigned SubIdx = SubIndices[Idx];
3517
// V_CNDMASK takes its inputs in (false, true) order, the scalar selects in
// (true, false) order.
3519 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3520 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3521 .addReg(FalseReg, {}, SubIdx)
3522 .addReg(TrueReg, {}, SubIdx);
3523 } else {
3524 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3525 .addReg(TrueReg, {}, SubIdx)
3526 .addReg(FalseReg, {}, SubIdx);
3527 }
3528
3529 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3531
3532 MIB.addReg(DstElt)
3533 .addImm(SubIdx);
3534 }
3535}
3536
3538
// Returns true when MI is a branch, call, return or indirect branch, or when
// its opcode is one of the program-end, trap, getreg/setreg, sendmsg or
// barrier instructions listed below. Returns false for everything else.
3539 if (MI.isBranch() || MI.isCall() || MI.isReturn() || MI.isIndirectBranch())
3540 return true;
3541
3542 switch (MI.getOpcode()) {
3543 case AMDGPU::S_ENDPGM:
3544 case AMDGPU::S_ENDPGM_SAVED:
3545 case AMDGPU::S_TRAP:
3546 case AMDGPU::S_GETREG_B32:
3547 case AMDGPU::S_SETREG_B32:
3548 case AMDGPU::S_SETREG_B32_mode:
3549 case AMDGPU::S_SETREG_IMM32_B32:
3550 case AMDGPU::S_SETREG_IMM32_B32_mode:
3551 case AMDGPU::S_SENDMSG:
3552 case AMDGPU::S_SENDMSGHALT:
3553 case AMDGPU::S_SENDMSG_RTN_B32:
3554 case AMDGPU::S_SENDMSG_RTN_B64:
3555 case AMDGPU::S_BARRIER_WAIT:
3556 case AMDGPU::S_BARRIER_SIGNAL_M0:
3557 case AMDGPU::S_BARRIER_SIGNAL_IMM:
3558 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
3559 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
3560 return true;
3561 default:
3562 return false;
3563 }
3564}
3565
// Returns true when MI's opcode is one of the move/copy opcodes listed below
// and false otherwise. The same opcode list appears in the neighbouring
// switch that reports the source operand index of a foldable copy; keep the
// two in sync.
3567 switch (MI.getOpcode()) {
3568 case AMDGPU::V_MOV_B16_t16_e32:
3569 case AMDGPU::V_MOV_B16_t16_e64:
3570 case AMDGPU::V_MOV_B32_e32:
3571 case AMDGPU::V_MOV_B32_e64:
3572 case AMDGPU::V_MOV_B64_PSEUDO:
3573 case AMDGPU::V_MOV_B64_e32:
3574 case AMDGPU::V_MOV_B64_e64:
3575 case AMDGPU::S_MOV_B32:
3576 case AMDGPU::S_MOV_B64:
3577 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3578 case AMDGPU::COPY:
3579 case AMDGPU::WWM_COPY:
3580 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3581 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3582 case AMDGPU::V_ACCVGPR_MOV_B32:
3583 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3584 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3585 return true;
3586 default:
3587 return false;
3588 }
3589}
3590
// Returns the operand index of the source of a foldable copy: 2 for the two
// V_MOV_B16_t16 encodings and 1 for every other opcode listed. Calling this
// with any other opcode is a programming error (llvm_unreachable).
3592 switch (MI.getOpcode()) {
3593 case AMDGPU::V_MOV_B16_t16_e32:
3594 case AMDGPU::V_MOV_B16_t16_e64:
3595 return 2;
3596 case AMDGPU::V_MOV_B32_e32:
3597 case AMDGPU::V_MOV_B32_e64:
3598 case AMDGPU::V_MOV_B64_PSEUDO:
3599 case AMDGPU::V_MOV_B64_e32:
3600 case AMDGPU::V_MOV_B64_e64:
3601 case AMDGPU::S_MOV_B32:
3602 case AMDGPU::S_MOV_B64:
3603 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3604 case AMDGPU::COPY:
3605 case AMDGPU::WWM_COPY:
3606 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3607 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3608 case AMDGPU::V_ACCVGPR_MOV_B32:
3609 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3610 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3611 return 1;
3612 default:
3613 llvm_unreachable("MI is not a foldable copy");
3614 }
3615}
3616
3617static constexpr AMDGPU::OpName ModifierOpNames[] = {
3618 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3619 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3620 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3621
// Removes from MI every operand named in ModifierOpNames that the opcode
// actually has.
3623 unsigned Opc = MI.getOpcode();
// The table is walked in reverse, presumably so that removing one operand
// does not shift the indices of the operands still to be removed.
3624 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3625 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3626 if (Idx >= 0)
3627 MI.removeOperand(Idx);
3628 }
3629}
3630
3632 const MCInstrDesc &NewDesc) const {
// Switches MI to NewDesc and then drops every trailing operand beyond the
// number the new descriptor accounts for (explicit operands plus implicit
// uses plus implicit defs).
3633 MI.setDesc(NewDesc);
3634
3635 // Remove any leftover implicit operands from mutating the instruction. e.g.
3636 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3637 // anymore.
3638 const MCInstrDesc &Desc = MI.getDesc();
3639 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3640 Desc.implicit_defs().size();
3641
// Operands are removed from the back so the remaining indices stay valid.
// NOTE(review): I is unsigned, so if NumOps were 0 the condition I >= NumOps
// could never become false. Presumably every descriptor passed here has at
// least one operand; confirm.
3642 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3643 MI.removeOperand(I);
3644}
3645
3646std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3647 unsigned SubRegIndex) {
3648 switch (SubRegIndex) {
3649 case AMDGPU::NoSubRegister:
3650 return Imm;
3651 case AMDGPU::sub0:
3652 return SignExtend64<32>(Imm);
3653 case AMDGPU::sub1:
3654 return SignExtend64<32>(Imm >> 32);
3655 case AMDGPU::lo16:
3656 return SignExtend64<16>(Imm);
3657 case AMDGPU::hi16:
3658 return SignExtend64<16>(Imm >> 16);
3659 case AMDGPU::sub1_lo16:
3660 return SignExtend64<16>(Imm >> 32);
3661 case AMDGPU::sub1_hi16:
3662 return SignExtend64<16>(Imm >> 48);
3663 default:
3664 return std::nullopt;
3665 }
3666
3667 llvm_unreachable("covered subregister switch");
3668}
3669
3670static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3671 switch (Opc) {
3672 case AMDGPU::V_MAC_F16_e32:
3673 case AMDGPU::V_MAC_F16_e64:
3674 case AMDGPU::V_MAD_F16_e64:
3675 return AMDGPU::V_MADAK_F16;
3676 case AMDGPU::V_MAC_F32_e32:
3677 case AMDGPU::V_MAC_F32_e64:
3678 case AMDGPU::V_MAD_F32_e64:
3679 return AMDGPU::V_MADAK_F32;
3680 case AMDGPU::V_FMAC_F32_e32:
3681 case AMDGPU::V_FMAC_F32_e64:
3682 case AMDGPU::V_FMA_F32_e64:
3683 return AMDGPU::V_FMAAK_F32;
3684 case AMDGPU::V_FMAC_F16_e32:
3685 case AMDGPU::V_FMAC_F16_e64:
3686 case AMDGPU::V_FMAC_F16_t16_e64:
3687 case AMDGPU::V_FMAC_F16_fake16_e64:
3688 case AMDGPU::V_FMAC_F16_t16_e32:
3689 case AMDGPU::V_FMAC_F16_fake16_e32:
3690 case AMDGPU::V_FMA_F16_e64:
3691 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3692 ? AMDGPU::V_FMAAK_F16_t16
3693 : AMDGPU::V_FMAAK_F16_fake16
3694 : AMDGPU::V_FMAAK_F16;
3695 case AMDGPU::V_FMAC_F64_e32:
3696 case AMDGPU::V_FMAC_F64_e64:
3697 case AMDGPU::V_FMA_F64_e64:
3698 return AMDGPU::V_FMAAK_F64;
3699 default:
3700 llvm_unreachable("invalid instruction");
3701 }
3702}
3703
3704static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3705 switch (Opc) {
3706 case AMDGPU::V_MAC_F16_e32:
3707 case AMDGPU::V_MAC_F16_e64:
3708 case AMDGPU::V_MAD_F16_e64:
3709 return AMDGPU::V_MADMK_F16;
3710 case AMDGPU::V_MAC_F32_e32:
3711 case AMDGPU::V_MAC_F32_e64:
3712 case AMDGPU::V_MAD_F32_e64:
3713 return AMDGPU::V_MADMK_F32;
3714 case AMDGPU::V_FMAC_F32_e32:
3715 case AMDGPU::V_FMAC_F32_e64:
3716 case AMDGPU::V_FMA_F32_e64:
3717 return AMDGPU::V_FMAMK_F32;
3718 case AMDGPU::V_FMAC_F16_e32:
3719 case AMDGPU::V_FMAC_F16_e64:
3720 case AMDGPU::V_FMAC_F16_t16_e64:
3721 case AMDGPU::V_FMAC_F16_fake16_e64:
3722 case AMDGPU::V_FMAC_F16_t16_e32:
3723 case AMDGPU::V_FMAC_F16_fake16_e32:
3724 case AMDGPU::V_FMA_F16_e64:
3725 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3726 ? AMDGPU::V_FMAMK_F16_t16
3727 : AMDGPU::V_FMAMK_F16_fake16
3728 : AMDGPU::V_FMAMK_F16;
3729 case AMDGPU::V_FMAC_F64_e32:
3730 case AMDGPU::V_FMAC_F64_e64:
3731 case AMDGPU::V_FMA_F64_e64:
3732 return AMDGPU::V_FMAMK_F64;
3733 default:
3734 llvm_unreachable("invalid instruction");
3735 }
3736}
3737
3739 Register Reg, MachineRegisterInfo *MRI) const {
3740 int64_t Imm;
3741 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3742 return false;
3743
3744 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3745
3746 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3747
3748 unsigned Opc = UseMI.getOpcode();
3749 if (Opc == AMDGPU::COPY) {
3750 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3751
3752 Register DstReg = UseMI.getOperand(0).getReg();
3753 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3754
3755 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3756
3757 if (HasMultipleUses) {
3758 // TODO: This should fold in more cases with multiple use, but we need to
3759 // more carefully consider what those uses are.
3760 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3761
3762 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3763 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3764 return false;
3765
3766 // Most of the time folding a 32-bit inline constant is free (though this
3767 // might not be true if we can't later fold it into a real user).
3768 //
3769 // FIXME: This isInlineConstant check is imprecise if
3770 // getConstValDefinedInReg handled the tricky non-mov cases.
3771 if (ImmDefSize == 32 &&
3773 return false;
3774 }
3775
3776 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3777 RI.getSubRegIdxSize(UseSubReg) == 16;
3778
3779 if (Is16Bit) {
3780 if (RI.hasVGPRs(DstRC))
3781 return false; // Do not clobber vgpr_hi16
3782
3783 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3784 return false;
3785 }
3786
3787 MachineFunction *MF = UseMI.getMF();
3788
3789 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3790 MCRegister MovDstPhysReg =
3791 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3792
3793 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3794
3795 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3796 for (unsigned MovOp :
3797 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3798 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3799 const MCInstrDesc &MovDesc = get(MovOp);
3800
3801 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3802 if (Is16Bit) {
3803 // We just need to find a correctly sized register class, so the
3804 // subregister index compatibility doesn't matter since we're statically
3805 // extracting the immediate value.
3806 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3807 if (!MovDstRC)
3808 continue;
3809
3810 if (MovDstPhysReg) {
3811 // FIXME: We probably should not do this. If there is a live value in
3812 // the high half of the register, it will be corrupted.
3813 MovDstPhysReg =
3814 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3815 if (!MovDstPhysReg)
3816 continue;
3817 }
3818 }
3819
3820 // Result class isn't the right size, try the next instruction.
3821 if (MovDstPhysReg) {
3822 if (!MovDstRC->contains(MovDstPhysReg))
3823 return false;
3824 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3825 // TODO: This will be overly conservative in the case of 16-bit virtual
3826 // SGPRs. We could hack up the virtual register uses to use a compatible
3827 // 32-bit class.
3828 continue;
3829 }
3830
3831 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3832
3833 // Ensure the interpreted immediate value is a valid operand in the new
3834 // mov.
3835 //
3836 // FIXME: isImmOperandLegal should have form that doesn't require existing
3837 // MachineInstr or MachineOperand
3838 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3839 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3840 break;
3841
3842 NewOpc = MovOp;
3843 break;
3844 }
3845
3846 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3847 return false;
3848
3849 if (Is16Bit) {
3850 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3851 if (MovDstPhysReg)
3852 UseMI.getOperand(0).setReg(MovDstPhysReg);
3853 assert(UseMI.getOperand(1).getReg().isVirtual());
3854 }
3855
3856 const MCInstrDesc &NewMCID = get(NewOpc);
3857 UseMI.setDesc(NewMCID);
3858 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3859 UseMI.addImplicitDefUseOperands(*MF);
3860 return true;
3861 }
3862
3863 if (HasMultipleUses)
3864 return false;
3865
3866 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3867 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3868 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3869 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3870 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3871 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3872 Opc == AMDGPU::V_FMAC_F64_e64) {
3873 // Don't fold if we are using source or output modifiers. The new VOP2
3874 // instructions don't have them.
3876 return false;
3877
3878 // If this is a free constant, there's no reason to do this.
3879 // TODO: We could fold this here instead of letting SIFoldOperands do it
3880 // later.
3881 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3882
3883 // Any src operand can be used for the legality check.
3884 if (isInlineConstant(UseMI, Src0Idx, Imm))
3885 return false;
3886
3887 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3888
3889 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3890 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3891
3892 auto CopyRegOperandToNarrowerRC =
3893 [MRI, this](MachineInstr &MI, unsigned OpNo,
3894 const TargetRegisterClass *NewRC) -> void {
3895 if (!MI.getOperand(OpNo).isReg())
3896 return;
3897 Register Reg = MI.getOperand(OpNo).getReg();
3898 const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg);
3899 if (RI.getCommonSubClass(RC, NewRC) != NewRC)
3900 return;
3901 Register Tmp = MRI->createVirtualRegister(NewRC);
3902 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
3903 get(AMDGPU::COPY), Tmp)
3904 .addReg(Reg);
3905 MI.getOperand(OpNo).setReg(Tmp);
3906 MI.getOperand(OpNo).setIsKill();
3907 };
3908
3909 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3910 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3911 (Src1->isReg() && Src1->getReg() == Reg)) {
3912 MachineOperand *RegSrc =
3913 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3914 if (!RegSrc->isReg())
3915 return false;
3916 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3917 ST.getConstantBusLimit(Opc) < 2)
3918 return false;
3919
3920 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3921 return false;
3922
3923 // If src2 is also a literal constant then we have to choose which one to
3924 // fold. In general it is better to choose madak so that the other literal
3925 // can be materialized in an sgpr instead of a vgpr:
3926 // s_mov_b32 s0, literal
3927 // v_madak_f32 v0, s0, v0, literal
3928 // Instead of:
3929 // v_mov_b32 v1, literal
3930 // v_madmk_f32 v0, v0, literal, v1
3931 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3932 if (Def && Def->isMoveImmediate() &&
3933 !isInlineConstant(Def->getOperand(1)))
3934 return false;
3935
3936 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3937 if (pseudoToMCOpcode(NewOpc) == -1)
3938 return false;
3939
3940 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3941 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3942
3943 // FIXME: This would be a lot easier if we could return a new instruction
3944 // instead of having to modify in place.
3945
3946 Register SrcReg = RegSrc->getReg();
3947 unsigned SrcSubReg = RegSrc->getSubReg();
3948 Src0->setReg(SrcReg);
3949 Src0->setSubReg(SrcSubReg);
3950 Src0->setIsKill(RegSrc->isKill());
3951
3952 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3953 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3954 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3955 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3956 UseMI.untieRegOperand(
3957 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3958
3959 Src1->ChangeToImmediate(*SubRegImm);
3960
3962 UseMI.setDesc(get(NewOpc));
3963
3964 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3965 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3966 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3967 Register Tmp = MRI->createVirtualRegister(NewRC);
3968 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3969 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3970 UseMI.getOperand(0).getReg())
3971 .addReg(Tmp, RegState::Kill);
3972 UseMI.getOperand(0).setReg(Tmp);
3973 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3974 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3975 }
3976
3977 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3978 if (DeleteDef)
3979 DefMI.eraseFromParent();
3980
3981 return true;
3982 }
3983
3984 // Added part is the constant: Use v_madak_{f16, f32}.
3985 if (Src2->isReg() && Src2->getReg() == Reg) {
3986 if (ST.getConstantBusLimit(Opc) < 2) {
3987 // Not allowed to use constant bus for another operand.
3988 // We can however allow an inline immediate as src0.
3989 bool Src0Inlined = false;
3990 if (Src0->isReg()) {
3991 // Try to inline constant if possible.
3992 // If the Def moves immediate and the use is single
3993 // We are saving VGPR here.
3994 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3995 if (Def && Def->isMoveImmediate() &&
3996 isInlineConstant(Def->getOperand(1)) &&
3997 MRI->hasOneNonDBGUse(Src0->getReg())) {
3998 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3999 Src0Inlined = true;
4000 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
4001 RI.isSGPRReg(*MRI, Src0->getReg())) {
4002 return false;
4003 }
4004 // VGPR is okay as Src0 - fallthrough
4005 }
4006
4007 if (Src1->isReg() && !Src0Inlined) {
4008 // We have one slot for inlinable constant so far - try to fill it
4009 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
4010 if (Def && Def->isMoveImmediate() &&
4011 isInlineConstant(Def->getOperand(1)) &&
4012 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
4013 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
4014 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
4015 return false;
4016 // VGPR is okay as Src1 - fallthrough
4017 }
4018 }
4019
4020 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4021 if (pseudoToMCOpcode(NewOpc) == -1)
4022 return false;
4023
4024 // FIXME: This would be a lot easier if we could return a new instruction
4025 // instead of having to modify in place.
4026
4027 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
4028 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
4029 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
4030 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
4031 UseMI.untieRegOperand(
4032 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
4033
4034 const std::optional<int64_t> SubRegImm =
4035 extractSubregFromImm(Imm, Src2->getSubReg());
4036
4037 // ChangingToImmediate adds Src2 back to the instruction.
4038 Src2->ChangeToImmediate(*SubRegImm);
4039
4040 // These come before src2.
4042 UseMI.setDesc(get(NewOpc));
4043
4044 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
4045 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
4046 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
4047 Register Tmp = MRI->createVirtualRegister(NewRC);
4048 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
4049 UseMI.getDebugLoc(), get(AMDGPU::COPY),
4050 UseMI.getOperand(0).getReg())
4051 .addReg(Tmp, RegState::Kill);
4052 UseMI.getOperand(0).setReg(Tmp);
4053 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
4054 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
4055 }
4056
4057 // It might happen that UseMI was commuted
4058 // and we now have SGPR as SRC1. If so 2 inlined
4059 // constant and SGPR are illegal.
4061
4062 bool DeleteDef = MRI->use_nodbg_empty(Reg);
4063 if (DeleteDef)
4064 DefMI.eraseFromParent();
4065
4066 return true;
4067 }
4068 }
4069
4070 return false;
4071}
4072
4073static bool
4076 if (BaseOps1.size() != BaseOps2.size())
4077 return false;
4078 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
4079 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
4080 return false;
4081 }
4082 return true;
4083}
4084
4085static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
4086 LocationSize WidthB, int OffsetB) {
4087 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
4088 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
4089 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
4090 return LowWidth.hasValue() &&
4091 LowOffset + (int)LowWidth.getValue() <= HighOffset;
4092}
4093
4094bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
4095 const MachineInstr &MIb) const {
4096 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
4097 int64_t Offset0, Offset1;
4098 LocationSize Dummy0 = LocationSize::precise(0);
4099 LocationSize Dummy1 = LocationSize::precise(0);
4100 bool Offset0IsScalable, Offset1IsScalable;
4101 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
4102 Dummy0, &RI) ||
4103 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
4104 Dummy1, &RI))
4105 return false;
4106
4107 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
4108 return false;
4109
4110 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
4111 // FIXME: Handle ds_read2 / ds_write2.
4112 return false;
4113 }
4114 LocationSize Width0 = MIa.memoperands().front()->getSize();
4115 LocationSize Width1 = MIb.memoperands().front()->getSize();
4116 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
4117}
4118
// NOTE(review): the first line of this definition (return type, qualified name
// and the first parameter, which the body calls MIa) is missing from this
// listing; the embedded numbering jumps from 4118 to 4120.
//
// Conservative disjointness test between two memory instructions, driven by
// the encoding class of each one (DS, MUBUF/MTBUF, SMRD, FLAT). Any
// combination that is not positively recognised below yields false.
4120 const MachineInstr &MIb) const {
4121 assert(MIa.mayLoadOrStore() &&
4122 "MIa must load from or modify a memory location");
4123 assert(MIb.mayLoadOrStore() &&
4124 "MIb must load from or modify a memory location");
4125
// NOTE(review): the `if` condition guarding this return is missing (4126 is
// skipped); as listed here the return would be unconditional.
4127 return false;
4128
4129 // XXX - Can we relax this between address spaces?
4130 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
4131 return false;
4132
// LDS DMA instructions and bundles are never reported as disjoint.
4133 if (isLDSDMA(MIa) || isLDSDMA(MIb))
4134 return false;
4135
4136 if (MIa.isBundle() || MIb.isBundle())
4137 return false;
4138
4139 // TODO: Should we check the address space from the MachineMemOperand? That
4140 // would allow us to distinguish objects we know don't alias based on the
4141 // underlying address space, even if it was lowered to a different one,
4142 // e.g. private accesses lowered to use MUBUF instructions on a scratch
4143 // buffer.
// DS vs DS: compare offsets. DS vs anything else: disjoint unless the other
// instruction is a FLAT access that is not segment specific.
4144 if (isDS(MIa)) {
4145 if (isDS(MIb))
4146 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4147
4148 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
4149 }
4150
// Buffer vs buffer: compare offsets. Buffer vs FLAT: disjoint only for FLAT
// scratch. Otherwise disjoint unless the other instruction is SMRD.
4151 if (isMUBUF(MIa) || isMTBUF(MIa)) {
4152 if (isMUBUF(MIb) || isMTBUF(MIb))
4153 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4154
4155 if (isFLAT(MIb))
4156 return isFLATScratch(MIb);
4157
4158 return !isSMRD(MIb);
4159 }
4160
// SMRD vs SMRD: compare offsets. SMRD vs FLAT: disjoint only for FLAT scratch.
// Otherwise disjoint unless the other instruction is a buffer access.
4161 if (isSMRD(MIa)) {
4162 if (isSMRD(MIb))
4163 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4164
4165 if (isFLAT(MIb))
4166 return isFLATScratch(MIb);
4167
4168 return !isMUBUF(MIb) && !isMTBUF(MIb);
4169 }
4170
// FLAT vs FLAT: a scratch/global pair is disjoint outright; any other FLAT
// pair falls back to the offset comparison. FLAT vs non-FLAT is not handled
// here and returns false.
4171 if (isFLAT(MIa)) {
4172 if (isFLAT(MIb)) {
4173 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
4174 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
4175 return true;
4176
4177 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4178 }
4179
4180 return false;
4181 }
4182
4183 return false;
4184}
4185
4187 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4188 if (Reg.isPhysical())
4189 return false;
4190 auto *Def = MRI.getUniqueVRegDef(Reg);
4191 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
4192 Imm = Def->getOperand(1).getImm();
4193 if (DefMI)
4194 *DefMI = Def;
4195 return true;
4196 }
4197 return false;
4198}
4199
4200static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4201 MachineInstr **DefMI = nullptr) {
4202 if (!MO->isReg())
4203 return false;
4204 const MachineFunction *MF = MO->getParent()->getMF();
4205 const MachineRegisterInfo &MRI = MF->getRegInfo();
4206 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4207}
4208
4210 MachineInstr &NewMI) {
4211 if (LV) {
4212 unsigned NumOps = MI.getNumOperands();
4213 for (unsigned I = 1; I < NumOps; ++I) {
4214 MachineOperand &Op = MI.getOperand(I);
4215 if (Op.isReg() && Op.isKill())
4216 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4217 }
4218 }
4219}
4220
4221static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4222 switch (Opc) {
4223 case AMDGPU::V_MAC_F16_e32:
4224 case AMDGPU::V_MAC_F16_e64:
4225 return AMDGPU::V_MAD_F16_e64;
4226 case AMDGPU::V_MAC_F32_e32:
4227 case AMDGPU::V_MAC_F32_e64:
4228 return AMDGPU::V_MAD_F32_e64;
4229 case AMDGPU::V_MAC_LEGACY_F32_e32:
4230 case AMDGPU::V_MAC_LEGACY_F32_e64:
4231 return AMDGPU::V_MAD_LEGACY_F32_e64;
4232 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4233 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4234 return AMDGPU::V_FMA_LEGACY_F32_e64;
4235 case AMDGPU::V_FMAC_F16_e32:
4236 case AMDGPU::V_FMAC_F16_e64:
4237 case AMDGPU::V_FMAC_F16_t16_e64:
4238 case AMDGPU::V_FMAC_F16_fake16_e64:
4239 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4240 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4241 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4242 : AMDGPU::V_FMA_F16_gfx9_e64;
4243 case AMDGPU::V_FMAC_F32_e32:
4244 case AMDGPU::V_FMAC_F32_e64:
4245 return AMDGPU::V_FMA_F32_e64;
4246 case AMDGPU::V_FMAC_F64_e32:
4247 case AMDGPU::V_FMAC_F64_e64:
4248 return AMDGPU::V_FMA_F64_e64;
4249 default:
4250 llvm_unreachable("invalid instruction");
4251 }
4252}
4253
4254/// Helper struct for the implementation of 3-address conversion to communicate
4255/// updates made to instruction operands.
// NOTE(review): the line that opens the struct is missing from this listing
// (the embedded numbering skips 4256). Restore it from the original file.
4257 /// Other instruction whose def is no longer used by the converted
4258 /// instruction.
// NOTE(review): the member declaration itself is also missing (4259 is
// skipped). Other code in this file assigns a MachineInstr pointer to a field
// named RemoveMIUse on this type and tests it for null, so the missing member
// is presumably that pointer — confirm against the original file.
4260};
4261
// NOTE(review): the first line of this definition (return type, qualified name
// and the first parameter, which the body calls MI) is missing from this
// listing; the embedded numbering jumps from 4261 to 4263.
//
// Driver around convertToThreeAddressImpl. It
//  - accepts a plain instruction or a bundle holding exactly one instruction,
//  - asks convertToThreeAddressImpl for a replacement,
//  - updates LiveVariables (LV) and LiveIntervals (LIS) when they are given,
//  - cleans up the instruction recorded in U.RemoveMIUse, if any.
// Returns nullptr when no conversion happened; otherwise the bundle header
// (for bundles) or the newly built instruction.
4263 LiveVariables *LV,
4264 LiveIntervals *LIS) const {
4265 MachineBasicBlock &MBB = *MI.getParent();
4266 MachineInstr *CandidateMI = &MI;
4267
4268 if (MI.isBundle()) {
4269 // This is a temporary placeholder for bundle handling that enables us to
4270 // exercise the relevant code paths in the two-address instruction pass.
4271 if (MI.getBundleSize() != 1)
4272 return nullptr;
4273 CandidateMI = MI.getNextNode();
4274 }
4275
// NOTE(review): the declaration of the local `U` passed below is missing from
// this listing (4276 is skipped).
4277 MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
4278 if (!NewMI)
4279 return nullptr;
4280
4281 if (MI.isBundle()) {
// Bundle case: drop the old instruction from the bundle and untie any tied
// defs on the bundle header.
4282 CandidateMI->eraseFromBundle();
4283
4284 for (MachineOperand &MO : MI.all_defs()) {
4285 if (MO.isTied())
4286 MI.untieRegOperand(MO.getOperandNo());
4287 }
4288 } else {
4289 updateLiveVariables(LV, MI, *NewMI);
4290 if (LIS) {
4291 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4292 // SlotIndex of defs needs to be updated when converting to early-clobber
4293 MachineOperand &Def = NewMI->getOperand(0);
4294 if (Def.isEarlyClobber() && Def.isReg() &&
4295 LIS->hasInterval(Def.getReg())) {
4296 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4297 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4298 auto &LI = LIS->getInterval(Def.getReg());
// Rewrites the start of the segment (and its value number's def) that begins
// exactly at OldIndex so that it begins at NewIndex instead.
4299 auto UpdateDefIndex = [&](LiveRange &LR) {
4300 auto *S = LR.find(OldIndex);
4301 if (S != LR.end() && S->start == OldIndex) {
4302 assert(S->valno && S->valno->def == OldIndex);
4303 S->start = NewIndex;
4304 S->valno->def = NewIndex;
4305 }
4306 };
// Apply to the main range and to every subrange.
4307 UpdateDefIndex(LI);
4308 for (auto &SR : LI.subranges())
4309 UpdateDefIndex(SR);
4310 }
4311 }
4312 }
4313
// The implementation folded an immediate and reported the instruction that
// defined it; its register may no longer be needed by the converted code.
4314 if (U.RemoveMIUse) {
4315 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4316 // The only user is the instruction which will be killed.
4317 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4318
4319 if (MRI.hasOneNonDBGUse(DefReg)) {
4320 // We cannot just remove the DefMI here, calling pass will crash.
4321 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4322 U.RemoveMIUse->getOperand(0).setIsDead(true);
// Strip every operand except operand 0, highest index first.
4323 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4324 U.RemoveMIUse->removeOperand(I);
4325 if (LV)
4326 LV->getVarInfo(DefReg).AliveBlocks.clear();
4327 }
4328
4329 if (MI.isBundle()) {
// If nothing inside the bundle reads or writes DefReg any more, remove the
// first matching use operand from the bundle header.
4330 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4331 if (!VRI.Reads && !VRI.Writes) {
4332 for (MachineOperand &MO : MI.all_uses()) {
4333 if (MO.isReg() && MO.getReg() == DefReg) {
4334 assert(MO.getSubReg() == 0 &&
4335 "tied sub-registers in bundles currently not supported");
4336 MI.removeOperand(MO.getOperandNo());
4337 break;
4338 }
4339 }
4340
4341 if (LIS)
4342 LIS->shrinkToUses(&LIS->getInterval(DefReg));
4343 }
4344 } else if (LIS) {
4345 LiveInterval &DefLI = LIS->getInterval(DefReg);
4346
4347 // We cannot delete the original instruction here, so hack out the use
4348 // in the original instruction with a dummy register so we can use
4349 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4350 // not have the complexity of deleting a use to consider here.
4351 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4352 for (MachineOperand &MIOp : MI.uses()) {
4353 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4354 MIOp.setIsUndef(true);
4355 MIOp.setReg(DummyReg);
4356 }
4357 }
4358
// NOTE(review): this code sits in the `else` arm of the `MI.isBundle()` test
// above, so this nested bundle check looks unreachable — confirm and consider
// removing it.
4359 if (MI.isBundle()) {
4360 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4361 if (!VRI.Reads && !VRI.Writes) {
4362 for (MachineOperand &MIOp : MI.uses()) {
4363 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4364 MIOp.setIsUndef(true);
4365 MIOp.setReg(DummyReg);
4366 }
4367 }
4368 }
4369
4370 MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
4371 false, /*isUndef=*/true));
4372 }
4373
4374 LIS->shrinkToUses(&DefLI);
4375 }
4376 }
4377
4378 return MI.isBundle() ? &MI : NewMI;
4379}
4380
// NOTE(review): the line carrying this definition's return type is missing
// from this listing (the embedded numbering skips 4381).
//
// Builds the three-address replacement for MI in front of it and returns the
// new instruction, or nullptr when MI is not convertible. Handled, in order:
//  1. opcodes with an MFMA early-clobber variant,
//  2. WMMA instructions,
//  3. MAC/FMAC: first try a form that takes a folded immediate
//     (getNewFMAAKInst / getNewFMAMKInst), then fall back to the opcode given
//     by getNewFMAInst.
// When an immediate is folded from a defining instruction, that instruction is
// recorded in U.RemoveMIUse for the caller.
4382SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4383 ThreeAddressUpdates &U) const {
4384 MachineBasicBlock &MBB = *MI.getParent();
4385 unsigned Opc = MI.getOpcode();
4386
4387 // Handle MFMA.
4388 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4389 if (NewMFMAOpc != -1) {
// NOTE(review): the declaration that receives this BuildMI result (used as
// `MIB` below) is missing from this listing (4390 is skipped).
4391 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4392 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4393 MIB.add(MI.getOperand(I));
4394 return MIB;
4395 }
4396
4397 if (SIInstrInfo::isWMMA(MI)) {
// Same explicit operands and flags, new opcode.
4398 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4399 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4400 .setMIFlags(MI.getFlags());
4401 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4402 MIB->addOperand(MI.getOperand(I));
4403 return MIB;
4404 }
4405
4406 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4407 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4408 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4409 "present pre-RA");
4410
4411 // Handle MAC/FMAC.
4412 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4413 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4414 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4415 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4416 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
// Set when src0 of an _e32 form is an immediate that is not an inline constant.
4417 bool Src0Literal = false;
4418
4419 switch (Opc) {
4420 default:
4421 return nullptr;
4422 case AMDGPU::V_MAC_F16_e64:
4423 case AMDGPU::V_FMAC_F16_e64:
4424 case AMDGPU::V_FMAC_F16_t16_e64:
4425 case AMDGPU::V_FMAC_F16_fake16_e64:
4426 case AMDGPU::V_MAC_F32_e64:
4427 case AMDGPU::V_MAC_LEGACY_F32_e64:
4428 case AMDGPU::V_FMAC_F32_e64:
4429 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4430 case AMDGPU::V_FMAC_F64_e64:
4431 break;
4432 case AMDGPU::V_MAC_F16_e32:
4433 case AMDGPU::V_FMAC_F16_e32:
4434 case AMDGPU::V_MAC_F32_e32:
4435 case AMDGPU::V_MAC_LEGACY_F32_e32:
4436 case AMDGPU::V_FMAC_F32_e32:
4437 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4438 case AMDGPU::V_FMAC_F64_e32: {
// _e32 forms: src0 must be a register or an immediate to be convertible.
4439 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4440 AMDGPU::OpName::src0);
4441 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4442 if (!Src0->isReg() && !Src0->isImm())
4443 return nullptr;
4444
4445 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4446 Src0Literal = true;
4447
4448 break;
4449 }
4450 }
4451
4452 MachineInstrBuilder MIB;
// Named operands of MI; the lookups for operands the opcode lacks are tested
// for null below.
4453 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4454 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4455 const MachineOperand *Src0Mods =
4456 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4457 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4458 const MachineOperand *Src1Mods =
4459 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4460 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4461 const MachineOperand *Src2Mods =
4462 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4463 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4464 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4465 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4466
// Immediate-folding forms are only tried when MI has no modifier, clamp or
// omod operands, is not a legacy opcode, and (for F64) the subtarget has the
// F64 fmaak/fmamk instructions.
4467 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4468 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4469 // If we have an SGPR input, we will violate the constant bus restriction.
4470 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4471 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4472 MachineInstr *DefMI;
4473
4474 int64_t Imm;
// Case 1: src2 is a foldable immediate -> dst, src0, src1, imm.
4475 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4476 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4477 if (pseudoToMCOpcode(NewOpc) != -1) {
4478 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4479 .add(*Dst)
4480 .add(*Src0)
4481 .add(*Src1)
4482 .addImm(Imm)
4483 .setMIFlags(MI.getFlags());
4484 U.RemoveMIUse = DefMI;
4485 return MIB;
4486 }
4487 }
4488 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
// Case 2: src1 is a foldable immediate -> dst, src0, imm, src2.
4489 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4490 if (pseudoToMCOpcode(NewOpc) != -1) {
4491 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4492 .add(*Dst)
4493 .add(*Src0)
4494 .addImm(Imm)
4495 .add(*Src2)
4496 .setMIFlags(MI.getFlags());
4497 U.RemoveMIUse = DefMI;
4498 return MIB;
4499 }
4500 }
// Case 3: src0 is a literal or a foldable immediate -> dst, src1, imm, src2
// (src1 moves into the first source slot).
4501 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4502 if (Src0Literal) {
// The immediate comes from MI itself, so there is no defining instruction.
4503 Imm = Src0->getImm();
4504 DefMI = nullptr;
4505 }
4506 if (pseudoToMCOpcode(NewOpc) != -1 &&
// NOTE(review): the callee name of this legality check is missing from this
// listing (4507 is skipped); only its argument list survives.
4508 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4509 Src1)) {
4510 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4511 .add(*Dst)
4512 .add(*Src1)
4513 .addImm(Imm)
4514 .add(*Src2)
4515 .setMIFlags(MI.getFlags());
4516 U.RemoveMIUse = DefMI;
4517 return MIB;
4518 }
4519 }
4520 }
4521
4522 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4523 // if VOP3 does not allow a literal operand.
4524 if (Src0Literal && !ST.hasVOP3Literal())
4525 return nullptr;
4526
4527 unsigned NewOpc = getNewFMAInst(ST, Opc);
4528
4529 if (pseudoToMCOpcode(NewOpc) == -1)
4530 return nullptr;
4531
// Generic fallback: carry every source with its modifier, using 0 for any
// modifier/clamp/omod operand MI does not have.
4532 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4533 .add(*Dst)
4534 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4535 .add(*Src0)
4536 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4537 .add(*Src1)
4538 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4539 .add(*Src2)
4540 .addImm(Clamp ? Clamp->getImm() : 0)
4541 .addImm(Omod ? Omod->getImm() : 0)
4542 .setMIFlags(MI.getFlags());
// op_sel is appended only when the new opcode has such an operand.
4543 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4544 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4545 return MIB;
4546}
4547
4548// It's not generally safe to move VALU instructions across these since it will
4549// start using the register as a base index rather than directly.
4550// XXX - Why isn't hasSideEffects sufficient for these?
// NOTE(review): the line that opens this definition is missing from this
// listing (the embedded numbering skips 4551); the body reads a parameter
// named MI.
//
// Returns true exactly for the three S_SET_GPR_IDX_* opcodes listed below.
4552 switch (MI.getOpcode()) {
4553 case AMDGPU::S_SET_GPR_IDX_ON:
4554 case AMDGPU::S_SET_GPR_IDX_MODE:
4555 case AMDGPU::S_SET_GPR_IDX_OFF:
4556 return true;
4557 default:
4558 return false;
4559 }
4560}
4561
// NOTE(review): the first line of this definition (return type, qualified name
// and the first parameter, which the body calls MI) is missing from this
// listing; the embedded numbering jumps from 4561 to 4563.
//
// Returns true when MI must not be scheduled across: terminators, position
// markers, INLINEASM_BR, a SCHED_BARRIER whose mask operand is 0, anything
// that modifies EXEC, and the listed S_SETREG / S_SETPRIO opcodes.
4563 const MachineBasicBlock *MBB,
4564 const MachineFunction &MF) const {
4565 // Skipping the check for SP writes in the base implementation. The reason it
4566 // was added was apparently due to compile time concerns.
4567 //
4568 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4569 // but is probably avoidable.
4570
4571 // Copied from base implementation.
4572 // Terminators and labels can't be scheduled around.
4573 if (MI.isTerminator() || MI.isPosition())
4574 return true;
4575
4576 // INLINEASM_BR can jump to another block
4577 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4578 return true;
4579
4580 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4581 return true;
4582
4583 // Target-independent instructions do not have an implicit-use of EXEC, even
4584 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4585 // boundaries prevents incorrect movements of such instructions.
4586 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4587 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4588 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4589 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4590 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
// NOTE(review): the final operand of this `||` chain is missing from this
// listing (4591 is skipped), so the expression is incomplete as shown.
4592}
4593
// NOTE(review): the line that opens this definition is missing from this
// listing (the embedded numbering skips 4594); the body reads a parameter
// named Opcode.
//
// Returns true for DS_ORDERED_COUNT, DS_ADD_GS_REG_RTN, DS_SUB_GS_REG_RTN and
// for any opcode accepted by isGWS.
4595 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4596 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4597 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4598}
4599
// NOTE(review): the line that opens this definition is missing from this
// listing (the embedded numbering skips 4600); the body reads a parameter
// named MI.
//
// Conservatively decides whether MI may access scratch memory, using its
// encoding, a function attribute and its memory operands.
4601 // Instructions that access scratch use FLAT encoding or BUF encodings.
4602 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4603 return false;
4604
4605 // SCRATCH instructions always access scratch.
4606 if (isFLATScratch(MI))
4607 return true;
4608
4609 // If FLAT_SCRATCH registers are not initialized, we can never access scratch
4610 // via the aperture.
4611 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4612 return false;
4613
4614 // If there are no memory operands then conservatively assume the flat
4615 // operation may access scratch.
4616 if (MI.memoperands_empty())
4617 return true;
4618
4619 // See if any memory operand specifies an address space that involves scratch.
4620 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4621 unsigned AS = Memop->getAddrSpace();
// A FLAT-address-space operand counts as scratch unless its NoAliasAddrSpace
// metadata exists and the range check on PRIVATE_ADDRESS below returns true.
4622 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4623 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4624 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4625 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4626 }
4627 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4628 });
4629}
4630
// NOTE(review): the line that opens this definition is missing from this
// listing (the embedded numbering skips 4631); the body reads a parameter
// named MI.
//
// For a FLAT instruction, conservatively decides whether it may access VMEM:
// true unless it does not use the VM counter or every memory operand is in the
// LOCAL address space.
4632 assert(isFLAT(MI));
4633
4634 // All flat instructions use the VMEM counter except prefetch.
4635 if (!usesVM_CNT(MI))
4636 return false;
4637
4638 // If there are no memory operands then conservatively assume the flat
4639 // operation may access VMEM.
4640 if (MI.memoperands_empty())
4641 return true;
4642
4643 // See if any memory operand specifies an address space that involves VMEM.
4644 // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
4645 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4646 // (GDS) address space is not supported by flat operations. Therefore, simply
4647 // return true unless only the LDS address space is found.
4648 for (const MachineMemOperand *Memop : MI.memoperands()) {
4649 unsigned AS = Memop->getAddrSpace();
// NOTE(review): one line is missing here in this listing (4650 is skipped).
4651 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4652 return true;
4653 }
4654
4655 return false;
4656}
4657
// NOTE(review): the line that opens this definition is missing from this
// listing (the embedded numbering skips 4658); the body reads a parameter
// named MI.
//
// For a FLAT instruction, conservatively decides whether it may access LDS.
4659 assert(isFLAT(MI));
4660
4661 // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
4662 if (!usesLGKM_CNT(MI))
4663 return false;
4664
4665 // If in tgsplit mode then there can be no use of LDS.
4666 if (ST.isTgSplitEnabled())
4667 return false;
4668
4669 // If there are no memory operands then conservatively assume the flat
4670 // operation may access LDS.
4671 if (MI.memoperands_empty())
4672 return true;
4673
4674 // See if any memory operand specifies an address space that involves LDS.
4675 for (const MachineMemOperand *Memop : MI.memoperands()) {
4676 unsigned AS = Memop->getAddrSpace();
// NOTE(review): the `if` condition that tests AS and guards this return is
// missing from this listing (4677 is skipped).
4678 return true;
4679 }
4680
4681 return false;
4682}
4683
// NOTE(review): the line that opens this definition is missing from this
// listing (the embedded numbering skips 4684); the body reads a parameter
// named MI.
//
// Returns true when AMDGPU::MODE appears among the implicit defs of MI's
// instruction descriptor.
4685 // Skip the full operand and register alias search modifiesRegister
4686 // does. There's only a handful of instructions that touch this, it's only an
4687 // implicit def, and doesn't alias any other registers.
4688 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4689}
4690
// NOTE(review): the line that opens this definition is missing from this
// listing (the embedded numbering skips 4691); the body reads a parameter
// named MI.
//
// Returns true for instructions that, per the comments below, must not run
// with an empty EXEC mask; false for everything else.
4692 unsigned Opcode = MI.getOpcode();
4693
4694 if (MI.mayStore() && isSMRD(MI))
4695 return true; // scalar store or atomic
4696
4697 // This will terminate the function when other lanes may need to continue.
4698 if (MI.isReturn())
4699 return true;
4700
4701 // These instructions cause shader I/O that may cause hardware lockups
4702 // when executed with an empty EXEC mask.
4703 //
4704 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4705 // EXEC = 0, but checking for that case here seems not worth it
4706 // given the typical code patterns.
4707 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4708 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4709 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT ||
4710 Opcode == AMDGPU::S_SETHALT)
4711 return true;
4712
4713 if (MI.isCall() || MI.isInlineAsm())
4714 return true; // conservative assumption
4715
4716 // Assume that barrier interactions are only intended with active lanes.
4717 if (isBarrier(Opcode))
4718 return true;
4719
4720 // A mode change is a scalar operation that influences vector instructions.
// NOTE(review): the `if` condition guarding this return is missing from this
// listing (4721 is skipped).
4722 return true;
4723
4724 // These are like SALU instructions in terms of effects, so it's questionable
4725 // whether we should return true for those.
4726 //
4727 // However, executing them with EXEC = 0 causes them to operate on undefined
4728 // data, which we avoid by returning true here.
4729 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4730 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4731 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4732 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4733 return true;
4734
4735 return false;
4736}
4737
// NOTE(review): the first line of this definition (return type, qualified name
// and the first parameter, which the body calls MRI) is missing from this
// listing; the embedded numbering jumps from 4737 to 4739.
//
// Conservatively decides whether MI may read EXEC. Meta instructions never do;
// calls and generic (non-target-specific) opcodes are assumed to.
4739 const MachineInstr &MI) const {
4740 if (MI.isMetaInstruction())
4741 return false;
4742
4743 // This won't read exec if this is an SGPR->SGPR copy.
4744 if (MI.isCopyLike()) {
// A copy-like instruction whose destination (operand 0) is not an SGPR is
// treated as reading EXEC.
4745 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4746 return true;
4747
4748 // Make sure this isn't copying exec as a normal operand
4749 return MI.readsRegister(AMDGPU::EXEC, &RI);
4750 }
4751
4752 // Make a conservative assumption about the callee.
4753 if (MI.isCall())
4754 return true;
4755
4756 // Be conservative with any unhandled generic opcodes.
4757 if (!isTargetSpecificOpcode(MI.getOpcode()))
4758 return true;
4759
// Anything that is not SALU is assumed to read EXEC; SALU only if it reads the
// register explicitly.
4760 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4761}
4762
4763bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4764 switch (Imm.getBitWidth()) {
4765 case 1: // This likely will be a condition code mask.
4766 return true;
4767
4768 case 32:
4769 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4770 ST.hasInv2PiInlineImm());
4771 case 64:
4772 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4773 ST.hasInv2PiInlineImm());
4774 case 16:
4775 return ST.has16BitInsts() &&
4776 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4777 ST.hasInv2PiInlineImm());
4778 default:
4779 llvm_unreachable("invalid bitwidth");
4780 }
4781}
4782
// NOTE(review): the line that opens this definition is missing from this
// listing (the embedded numbering skips 4783); the body reads a parameter
// named Imm on which getSemantics() and bitcastToAPInt() are called.
//
// Reports whether a floating-point immediate is an inline constant, choosing
// the check by the value's float semantics.
4784 APInt IntImm = Imm.bitcastToAPInt();
4785 int64_t IntImmVal = IntImm.getSExtValue();
4786 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4787 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4788 default:
4789 llvm_unreachable("invalid fltSemantics");
// NOTE(review): the `case` labels for the three returns below are missing from
// this listing (4790-4791, 4793 and 4796 are skipped). Judging by the callee
// names, the second is presumably the bfloat16 case and the third the
// half-precision case — confirm against the original file.
4792 return isInlineConstant(IntImm);
4794 return ST.has16BitInsts() &&
4795 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4797 return ST.has16BitInsts() &&
4798 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4799 }
4800}
4801
4802bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
  // Decides whether Imm may be used as an inline constant for an operand of
  // kind OperandType.
  //
  // NOTE(review): every `case` label of the switch below (and some opening
  // braces) is missing from this extract, so which operand types select which
  // branch cannot be determined here; consult the full file before editing.
4803 // MachineOperand provides no way to tell the true operand size, since it only
4804 // records a 64-bit value. We need to know the size to determine if a 32-bit
4805 // floating point immediate bit pattern is legal for an integer immediate. It
4806 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4807 switch (OperandType) {
  // Only the low 32 bits of Imm are considered for this group of operand types.
4817 int32_t Trunc = static_cast<int32_t>(Imm);
4818 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4819 }
  // This group is checked against the full 64-bit value.
4827 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4830 // We would expect inline immediates to not be concerned with an integer/fp
4831 // distinction. However, in the case of 16-bit integer operations, the
4832 // "floating point" values appear to not work. It seems to read the low 16 bits
4833 // of 32-bit immediates, which happens to always work for the integer
4834 // values.
4835 //
4836 // See llvm bugzilla 46302.
4837 //
4838 // TODO: Theoretically we could use op-sel to use the high bits of the
4839 // 32-bit FP values.
4848 return AMDGPU::isPKFMACF16InlineConstant(Imm, ST.isGFX11Plus());
4853 return false;
  // Accept values representable as either a signed or an unsigned 16-bit
  // quantity; the low 16 bits are then checked as an FP16 inline constant.
4856 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4857 // A few special case instructions have 16-bit operands on subtargets
4858 // where 16-bit instructions are not legal.
4859 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4860 // constants in these cases
4861 int16_t Trunc = static_cast<int16_t>(Imm);
4862 return ST.has16BitInsts() &&
4863 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4864 }
4865
4866 return false;
4867 }
  // Same range test as above, but the low 16 bits are checked as a BF16 inline
  // constant.
4870 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4871 int16_t Trunc = static_cast<int16_t>(Imm);
4872 return ST.has16BitInsts() &&
4873 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4874 }
4875 return false;
4876 }
4880 return false;
4882 return isLegalAV64PseudoImm(Imm);
4885 // Always embedded in the instruction for free.
4886 return true;
4896 // Just ignore anything else.
4897 return true;
4898 default:
4899 llvm_unreachable("invalid operand type");
4900 }
4901}
4902
  // Returns true when Op0 and Op1 have the same operand type and, for the two
  // handled types, the same register or the same immediate value. Any other
  // operand type is treated as unreachable.
  // NOTE(review): the `case` labels of the switch are missing from this
  // extract; the two handled types are inferred from the getReg()/getImm()
  // calls.
4903static bool compareMachineOp(const MachineOperand &Op0,
4904 const MachineOperand &Op1) {
4905 if (Op0.getType() != Op1.getType())
4906 return false;
4907
4908 switch (Op0.getType()) {
4910 return Op0.getReg() == Op1.getReg();
4912 return Op0.getImm() == Op1.getImm();
4913 default:
4914 llvm_unreachable("Didn't expect to be comparing these operand types");
4915 }
4916}
4917
4919 const MCOperandInfo &OpInfo) const {
  // NOTE(review): the first line of this signature (return type, name and the
  // InstDesc parameter used below) is missing from this extract.
  //
  // Decides whether the operand described by OpInfo may hold a literal
  // (non-inline) constant in an instruction described by InstDesc.

  // Plain immediate operands always qualify.
4920 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4921 return true;
4922
4923 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4924 return false;
4925
  // Outside VOP3, or for operands that are not SI source operands, nothing
  // further is checked.
4926 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4927 return true;
4928
  // A VOP3 source operand is only allowed when the subtarget reports VOP3
  // literal support.
4929 return ST.hasVOP3Literal();
4930}
4931
4932bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4933 int64_t ImmVal) const {
4934 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4935 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4936 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4937 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4938 AMDGPU::OpName::src2))
4939 return false;
4940 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4941 }
4942
4943 return isLiteralOperandLegal(InstDesc, OpInfo);
4944}
4945
4946bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4947 const MachineOperand &MO) const {
4948 if (MO.isImm())
4949 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4950
4951 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4952 "unexpected imm-like operand kind");
4953 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4954 return isLiteralOperandLegal(InstDesc, OpInfo);
4955}
4956
  // NOTE(review): the signature line is missing from this extract; Imm is the
  // 64-bit value passed by the caller.
  // Legal only when the low and the high 32-bit halves are each a 32-bit inline
  // constant.
4958 // 2 32-bit inline constants packed into one.
4959 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4960 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4961}
4962
4963bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4964 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4965 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4966 return false;
4967
4968 int Op32 = AMDGPU::getVOPe32(Opcode);
4969 if (Op32 == -1)
4970 return false;
4971
4972 return pseudoToMCOpcode(Op32) != -1;
4973}
4974
4975bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4976 // The src0_modifier operand is present on all instructions
4977 // that have modifiers.
4978
4979 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4980}
4981
4983 AMDGPU::OpName OpName) const {
  // NOTE(review): the first line of this signature (return type, name and the
  // MI parameter used below) is missing from this extract.
  //
  // True only when MI has the named operand and its immediate is non-zero; a
  // missing operand counts as "not set".
4984 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4985 return Mods && Mods->getImm();
4986}
4987
  // NOTE(review): the signature line is missing from this extract, and
  // ModifierOpNames is defined outside the visible source.
  // True if hasModifiersSet() holds for at least one name in ModifierOpNames.
4989 return any_of(ModifierOpNames,
4990 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4991}
4992
4994 const MachineRegisterInfo &MRI) const {
  // NOTE(review): the first line of this signature (return type, name and the
  // MI parameter used below) is missing from this extract.
  //
  // Checks whether MI satisfies the operand and modifier constraints for being
  // shrunk to a 32-bit encoding: restrictions on src2 and src1, no source
  // modifiers, an available 32-bit opcode, and no output modifiers.
4995 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4996 // Can't shrink instruction with three operands.
4997 if (Src2) {
  // Only the opcodes listed below may be shrunk despite having a src2.
4998 switch (MI.getOpcode()) {
4999 default: return false;
5000
5001 case AMDGPU::V_ADDC_U32_e64:
5002 case AMDGPU::V_SUBB_U32_e64:
5003 case AMDGPU::V_SUBBREV_U32_e64: {
5004 const MachineOperand *Src1
5005 = getNamedOperand(MI, AMDGPU::OpName::src1);
5006 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
5007 return false;
5008 // Additional verification is needed for sdst/src2.
  // These opcodes return here and therefore skip every check after the switch.
5009 return true;
5010 }
5011 case AMDGPU::V_MAC_F16_e64:
5012 case AMDGPU::V_MAC_F32_e64:
5013 case AMDGPU::V_MAC_LEGACY_F32_e64:
5014 case AMDGPU::V_FMAC_F16_e64:
5015 case AMDGPU::V_FMAC_F16_t16_e64:
5016 case AMDGPU::V_FMAC_F16_fake16_e64:
5017 case AMDGPU::V_FMAC_F32_e64:
5018 case AMDGPU::V_FMAC_F64_e64:
5019 case AMDGPU::V_FMAC_LEGACY_F32_e64:
  // For the MAC/FMAC opcodes src2 must be a VGPR with no src2 modifiers set.
5020 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
5021 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
5022 return false;
5023 break;
5024
5025 case AMDGPU::V_CNDMASK_B32_e64:
5026 break;
5027 }
5028 }
5029
  // If present, src1 must be a VGPR without modifiers.
5030 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
5031 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
5032 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
5033 return false;
5034
5035 // We don't need to check src0, all input types are legal, so just make sure
5036 // src0 isn't using any modifiers.
5037 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
5038 return false;
5039
5040 // Can it be shrunk to a valid 32 bit opcode?
5041 if (!hasVALU32BitEncoding(MI.getOpcode()))
5042 return false;
5043
5044 // Check output modifiers
5045 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
5046 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
5047 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
5048 // TODO: Can we avoid checking bound_ctrl/fi here?
5049 // They are only used by permlane*_swap special case.
5050 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
5051 !hasModifiersSet(MI, AMDGPU::OpName::fi);
5052}
5053
5054// Set VCC operand with all flags from \p Orig, except for setting it as
5055// implicit.
// NOTE(review): the first line of the signature (return type, name and the MI
// parameter used below) is missing from this extract.
5057 const MachineOperand &Orig) {
5058
  // Only the first implicit use of VCC or VCC_LO is updated; its undef and kill
  // flags are copied from Orig. If no such operand exists, MI is left
  // unchanged.
5059 for (MachineOperand &Use : MI.implicit_operands()) {
5060 if (Use.isUse() &&
5061 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
5062 Use.setIsUndef(Orig.isUndef());
5063 Use.setIsKill(Orig.isKill());
5064 return;
5065 }
5066 }
5067}
5068
5070 unsigned Op32) const {
  // NOTE(review): the first line of this signature (return type, name and the
  // MI parameter used below) is missing from this extract.
  //
  // Builds a new instruction with opcode Op32 immediately before MI in MI's
  // block, copying MI's debug location, flags and selected operands, and
  // returns the builder. MI itself is not erased here.
5071 MachineBasicBlock *MBB = MI.getParent();
5072
5073 const MCInstrDesc &Op32Desc = get(Op32);
5074 MachineInstrBuilder Inst32 =
5075 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
5076 .setMIFlags(MI.getFlags());
5077
5078 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
5079 // For VOPC instructions, this is replaced by an implicit def of vcc.
5080
5081 // We assume the defs of the shrunk opcode are in the same order, and the
5082 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
5083 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
5084 Inst32.add(MI.getOperand(I));
5085
5086 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
5087
  // Idx follows the operand-descriptor index of each explicit use so that its
  // operand type can be looked up.
5088 int Idx = MI.getNumExplicitDefs();
5089 for (const MachineOperand &Use : MI.explicit_uses()) {
5090 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
  // NOTE(review): the condition guarding the `continue` below (a test on OpTy)
  // is missing from this extract; the operand types it skips are not visible
  // here.
5092 continue;
5093
5094 if (&Use == Src2) {
5095 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
5096 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
5097 // replaced with an implicit read of vcc or vcc_lo. The implicit read
5098 // of vcc was already added during the initial BuildMI, but we
5099 // 1) may need to change vcc to vcc_lo to preserve the original register
5100 // 2) have to preserve the original flags.
5101 copyFlagsToImplicitVCC(*Inst32, *Src2);
5102 continue;
5103 }
5104 }
5105
5106 Inst32.add(Use);
5107 }
5108
5109 // FIXME: Losing implicit operands
5110 fixImplicitOperands(*Inst32);
5111 return Inst32;
5112}
5113
  // NOTE(review): the signature line is missing from this extract; RegOp is the
  // register operand being classified.
  // Returns whether RegOp's register counts as a constant-bus use, judged from
  // the register itself and whether the operand is implicit.
5115 // Null is free
5116 Register Reg = RegOp.getReg();
5117 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
5118 return false;
5119
5120 // SGPRs use the constant bus
5121
5122 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
5123 // physical register operands should also count, except for exec.
  // For implicit operands only VCC, VCC_LO and M0 are counted.
5124 if (RegOp.isImplicit())
5125 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
5126
5127 // SGPRs use the constant bus
5128 return AMDGPU::SReg_32RegClass.contains(Reg) ||
5129 AMDGPU::SReg_64RegClass.contains(Reg);
5130}
5131
5133 const MachineRegisterInfo &MRI) const {
  // NOTE(review): the first line of this signature (return type, name and the
  // RegOp parameter used below) is missing from this extract.
  //
  // Virtual registers are classified by their register class (SGPR class or
  // not); physical registers are delegated to physRegUsesConstantBus().
5134 Register Reg = RegOp.getReg();
5135 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5136 : physRegUsesConstantBus(RegOp);
5137}
5138
5140 const MachineOperand &MO,
5141 const MCOperandInfo &OpInfo) const {
  // NOTE(review): the first line of this signature (return type, name and the
  // MRI parameter used below) is missing from this extract.
  //
  // Returns whether operand MO, described by OpInfo, counts as a constant-bus
  // use.
5142 // Literal constants use the constant bus.
  // A non-register operand counts exactly when it is not an inline constant.
5143 if (!MO.isReg())
5144 return !isInlineConstant(MO, OpInfo);
5145
5146 Register Reg = MO.getReg();
  // NOTE(review): the `:` branch of the conditional expression below (the
  // non-virtual register case) is missing from this extract.
5147 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5149}
5150
  // NOTE(review): the signature line is missing from this extract; MI is the
  // instruction being scanned.
  // Returns the first implicit, non-def operand register of MI that is VCC,
  // VCC_LO, VCC_HI, M0 or FLAT_SCR, or a default-constructed Register if there
  // is none.
5152 for (const MachineOperand &MO : MI.implicit_operands()) {
5153 // We only care about reads.
5154 if (MO.isDef())
5155 continue;
5156
5157 switch (MO.getReg()) {
5158 case AMDGPU::VCC:
5159 case AMDGPU::VCC_LO:
5160 case AMDGPU::VCC_HI:
5161 case AMDGPU::M0:
5162 case AMDGPU::FLAT_SCR:
5163 return MO.getReg();
5164
5165 default:
5166 break;
5167 }
5168 }
5169
5170 return Register();
5171}
5172
  // Returns whether MI is expected to carry an implicit read of exec.
5173static bool shouldReadExec(const MachineInstr &MI) {
5174 if (SIInstrInfo::isVALU(MI)) {
  // VALU instructions are expected to read exec, except for the lane
  // read/write and SGPR<->VGPR spill/restore opcodes listed here.
5175 switch (MI.getOpcode()) {
5176 case AMDGPU::V_READLANE_B32:
5177 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5178 case AMDGPU::V_WRITELANE_B32:
5179 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5180 return false;
5181 }
5182
5183 return true;
5184 }
5185
  // NOTE(review): the remaining terms of the condition below are missing from
  // this extract. Pre-ISel opcodes and generic opcodes are among the cases that
  // return false.
5186 if (MI.isPreISelOpcode() ||
5187 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
5190 return false;
5191
5192 return true;
5193}
5194
5195static bool isRegOrFI(const MachineOperand &MO) {
5196 return MO.isReg() || MO.isFI();
5197}
5198
5199static bool isSubRegOf(const SIRegisterInfo &TRI,
5200 const MachineOperand &SuperVec,
5201 const MachineOperand &SubReg) {
5202 if (SubReg.getReg().isPhysical())
5203 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5204
5205 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5206 SubReg.getReg() == SuperVec.getReg();
5207}
5208
5209// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5210bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5211 const MachineRegisterInfo &MRI,
5212 StringRef &ErrInfo) const {
5213 Register DstReg = MI.getOperand(0).getReg();
5214 Register SrcReg = MI.getOperand(1).getReg();
5215 // This is a check for copy from vector register to SGPR
5216 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5217 ErrInfo = "illegal copy from vector register to SGPR";
5218 return false;
5219 }
5220 return true;
5221}
5222
5224 StringRef &ErrInfo) const {
5225 uint32_t Opcode = MI.getOpcode();
5226 const MachineFunction *MF = MI.getMF();
5227 const MachineRegisterInfo &MRI = MF->getRegInfo();
5228
5229 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5230 // Find a better property to recognize the point where instruction selection
5231 // is just done.
5232 // We can only enforce this check after SIFixSGPRCopies pass so that the
5233 // illegal copies are legalized and thereafter we don't expect a pass
5234 // inserting similar copies.
5235 if (!MRI.isSSA() && MI.isCopy())
5236 return verifyCopy(MI, MRI, ErrInfo);
5237
5238 if (SIInstrInfo::isGenericOpcode(Opcode))
5239 return true;
5240
5241 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5242 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5243 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5244 int Src3Idx = -1;
5245 if (Src0Idx == -1) {
5246 // VOPD V_DUAL_* instructions use different operand names.
5247 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5248 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5249 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5250 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5251 }
5252
5253 // Make sure the number of operands is correct.
5254 const MCInstrDesc &Desc = get(Opcode);
5255 if (!Desc.isVariadic() &&
5256 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5257 ErrInfo = "Instruction has wrong number of operands.";
5258 return false;
5259 }
5260
5261 if (MI.isInlineAsm()) {
5262 // Verify register classes for inlineasm constraints.
5263 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5264 I != E; ++I) {
5265 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5266 if (!RC)
5267 continue;
5268
5269 const MachineOperand &Op = MI.getOperand(I);
5270 if (!Op.isReg())
5271 continue;
5272
5273 Register Reg = Op.getReg();
5274 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5275 ErrInfo = "inlineasm operand has incorrect register class.";
5276 return false;
5277 }
5278 }
5279
5280 return true;
5281 }
5282
5283 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5284 ErrInfo = "missing memory operand from image instruction.";
5285 return false;
5286 }
5287
5288 // Make sure the register classes are correct.
5289 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5290 const MachineOperand &MO = MI.getOperand(i);
5291 if (MO.isFPImm()) {
5292 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5293 "all fp values to integers.";
5294 return false;
5295 }
5296
5297 const MCOperandInfo &OpInfo = Desc.operands()[i];
5298 int16_t RegClass = getOpRegClassID(OpInfo);
5299
5300 switch (OpInfo.OperandType) {
5302 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5303 ErrInfo = "Illegal immediate value for operand.";
5304 return false;
5305 }
5306 break;
5318 break;
5320 break;
5321 break;
5335 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5336 ErrInfo = "Illegal immediate value for operand.";
5337 return false;
5338 }
5339 break;
5340 }
5345 if (ST.has64BitLiterals() && Desc.getSize() != 4 && MO.isImm() &&
5346 !isInlineConstant(MI, i) &&
5348 OpInfo.OperandType ==
5350 ErrInfo = "illegal 64-bit immediate value for operand.";
5351 return false;
5352 }
5353 break;
5356 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5357 ErrInfo = "Expected inline constant for operand.";
5358 return false;
5359 }
5360 break;
5363 break;
5368 // Check if this operand is an immediate.
5369 // FrameIndex operands will be replaced by immediates, so they are
5370 // allowed.
5371 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5372 ErrInfo = "Expected immediate, but got non-immediate";
5373 return false;
5374 }
5375 break;
5379 break;
5380 default:
5381 if (OpInfo.isGenericType())
5382 continue;
5383 break;
5384 }
5385
5386 if (!MO.isReg())
5387 continue;
5388 Register Reg = MO.getReg();
5389 if (!Reg)
5390 continue;
5391
5392 // FIXME: Ideally we would have separate instruction definitions with the
5393 // aligned register constraint.
5394 // FIXME: We do not verify inline asm operands, but custom inline asm
5395 // verification is broken anyway
5396 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
5397 Opcode != AMDGPU::V_MOV_B64_PSEUDO && !isSpill(MI)) {
5398 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5399 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5400 if (const TargetRegisterClass *SubRC =
5401 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5402 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5403 if (RC)
5404 RC = SubRC;
5405 }
5406 }
5407
5408 // Check that this is the aligned version of the class.
5409 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5410 ErrInfo = "Subtarget requires even aligned vector registers";
5411 return false;
5412 }
5413 }
5414
5415 if (RegClass != -1) {
5416 if (Reg.isVirtual())
5417 continue;
5418
5419 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5420 if (!RC->contains(Reg)) {
5421 ErrInfo = "Operand has incorrect register class.";
5422 return false;
5423 }
5424 }
5425 }
5426
5427 // Verify SDWA
5428 if (isSDWA(MI)) {
5429 if (!ST.hasSDWA()) {
5430 ErrInfo = "SDWA is not supported on this target";
5431 return false;
5432 }
5433
5434 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5435 AMDGPU::OpName::dst_sel}) {
5436 const MachineOperand *MO = getNamedOperand(MI, Op);
5437 if (!MO)
5438 continue;
5439 int64_t Imm = MO->getImm();
5440 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5441 ErrInfo = "Invalid SDWA selection";
5442 return false;
5443 }
5444 }
5445
5446 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5447
5448 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5449 if (OpIdx == -1)
5450 continue;
5451 const MachineOperand &MO = MI.getOperand(OpIdx);
5452
5453 if (!ST.hasSDWAScalar()) {
5454 // Only VGPRS on VI
5455 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5456 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5457 return false;
5458 }
5459 } else {
5460 // No immediates on GFX9
5461 if (!MO.isReg()) {
5462 ErrInfo =
5463 "Only reg allowed as operands in SDWA instructions on GFX9+";
5464 return false;
5465 }
5466 }
5467 }
5468
5469 if (!ST.hasSDWAOmod()) {
5470 // No omod allowed on VI
5471 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5472 if (OMod != nullptr &&
5473 (!OMod->isImm() || OMod->getImm() != 0)) {
5474 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5475 return false;
5476 }
5477 }
5478
5479 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5480 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5481 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5482 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5483 const MachineOperand *Src0ModsMO =
5484 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5485 unsigned Mods = Src0ModsMO->getImm();
5486 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5487 Mods & SISrcMods::SEXT) {
5488 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5489 return false;
5490 }
5491 }
5492
5493 uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5494 if (isVOPC(BasicOpcode)) {
5495 if (!ST.hasSDWASdst() && DstIdx != -1) {
5496 // Only vcc allowed as dst on VI for VOPC
5497 const MachineOperand &Dst = MI.getOperand(DstIdx);
5498 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5499 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5500 return false;
5501 }
5502 } else if (!ST.hasSDWAOutModsVOPC()) {
5503 // No clamp allowed on GFX9 for VOPC
5504 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5505 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5506 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5507 return false;
5508 }
5509
5510 // No omod allowed on GFX9 for VOPC
5511 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5512 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5513 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5514 return false;
5515 }
5516 }
5517 }
5518
5519 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5520 if (DstUnused && DstUnused->isImm() &&
5521 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5522 const MachineOperand &Dst = MI.getOperand(DstIdx);
5523 if (!Dst.isReg() || !Dst.isTied()) {
5524 ErrInfo = "Dst register should have tied register";
5525 return false;
5526 }
5527
5528 const MachineOperand &TiedMO =
5529 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5530 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5531 ErrInfo =
5532 "Dst register should be tied to implicit use of preserved register";
5533 return false;
5534 }
5535 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5536 ErrInfo = "Dst register should use same physical register as preserved";
5537 return false;
5538 }
5539 }
5540 }
5541
5542 if (isDPP(MI) && !ST.hasDPPSrc1SGPR() && Src1Idx != -1) {
5543 const MachineOperand &Src1MO = MI.getOperand(Src1Idx);
5544 if (Src1MO.isReg() && RI.isSGPRReg(MRI, Src1MO.getReg())) {
5545 ErrInfo = "DPP src1 cannot be SGPR on this subtarget";
5546 return false;
5547 }
5548 }
5549
5550 // Verify MIMG / VIMAGE / VSAMPLE
5551 if (isImage(Opcode) && !MI.mayStore()) {
5552 // Ensure that the return type used is large enough for all the options
5553 // being used TFE/LWE require an extra result register.
5554 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5555 if (DMask) {
5556 uint64_t DMaskImm = DMask->getImm();
5557 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5558 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5559 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5560 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5561
5562 // Adjust for packed 16 bit values
5563 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5564 RegCount = divideCeil(RegCount, 2);
5565
5566 // Adjust if using LWE or TFE
5567 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5568 RegCount += 1;
5569
5570 const uint32_t DstIdx =
5571 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5572 const MachineOperand &Dst = MI.getOperand(DstIdx);
5573 if (Dst.isReg()) {
5574 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5575 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5576 if (RegCount > DstSize) {
5577 ErrInfo = "Image instruction returns too many registers for dst "
5578 "register class";
5579 return false;
5580 }
5581 }
5582 }
5583 }
5584
5585 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5586 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5587 unsigned ConstantBusCount = 0;
5588 bool UsesLiteral = false;
5589 const MachineOperand *LiteralVal = nullptr;
5590
5591 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5592 if (ImmIdx != -1) {
5593 ++ConstantBusCount;
5594 UsesLiteral = true;
5595 LiteralVal = &MI.getOperand(ImmIdx);
5596 }
5597
5598 SmallVector<Register, 2> SGPRsUsed;
5599 Register SGPRUsed;
5600
5601 // Only look at the true operands. Only a real operand can use the constant
5602 // bus, and we don't want to check pseudo-operands like the source modifier
5603 // flags.
5604 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5605 if (OpIdx == -1)
5606 continue;
5607 const MachineOperand &MO = MI.getOperand(OpIdx);
5608 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5609 if (MO.isReg()) {
5610 SGPRUsed = MO.getReg();
5611 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5612 ++ConstantBusCount;
5613 SGPRsUsed.push_back(SGPRUsed);
5614 }
5615 } else if (!MO.isFI()) { // Treat FI like a register.
5616 if (!UsesLiteral) {
5617 ++ConstantBusCount;
5618 UsesLiteral = true;
5619 LiteralVal = &MO;
5620 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5621 assert(isVOP2(MI) || isVOP3(MI));
5622 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5623 return false;
5624 }
5625 }
5626 }
5627 }
5628
5629 SGPRUsed = findImplicitSGPRRead(MI);
5630 if (SGPRUsed) {
5631 // Implicit uses may safely overlap true operands
5632 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5633 return !RI.regsOverlap(SGPRUsed, SGPR);
5634 })) {
5635 ++ConstantBusCount;
5636 SGPRsUsed.push_back(SGPRUsed);
5637 }
5638 }
5639
5640 // v_writelane_b32 is an exception from constant bus restriction:
5641 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5642 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5643 Opcode != AMDGPU::V_WRITELANE_B32) {
5644 ErrInfo = "VOP* instruction violates constant bus restriction";
5645 return false;
5646 }
5647
5648 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5649 ErrInfo = "VOP3 instruction uses literal";
5650 return false;
5651 }
5652 }
5653
5654 // Special case for writelane - this can break the multiple constant bus rule,
5655 // but still can't use more than one SGPR register
5656 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5657 unsigned SGPRCount = 0;
5658 Register SGPRUsed;
5659
5660 for (int OpIdx : {Src0Idx, Src1Idx}) {
5661 if (OpIdx == -1)
5662 break;
5663
5664 const MachineOperand &MO = MI.getOperand(OpIdx);
5665
5666 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5667 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5668 if (MO.getReg() != SGPRUsed)
5669 ++SGPRCount;
5670 SGPRUsed = MO.getReg();
5671 }
5672 }
5673 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5674 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5675 return false;
5676 }
5677 }
5678 }
5679
5680 // Verify misc. restrictions on specific instructions.
5681 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5682 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5683 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5684 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5685 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5686 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5687 if (!compareMachineOp(Src0, Src1) &&
5688 !compareMachineOp(Src0, Src2)) {
5689 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5690 return false;
5691 }
5692 }
5693 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5694 SISrcMods::ABS) ||
5695 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5696 SISrcMods::ABS) ||
5697 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5698 SISrcMods::ABS)) {
5699 ErrInfo = "ABS not allowed in VOP3B instructions";
5700 return false;
5701 }
5702 }
5703
5704 if (isSOP2(MI) || isSOPC(MI)) {
5705 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5706 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5707
5708 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5709 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5710 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5711 !Src0.isIdenticalTo(Src1)) {
5712 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5713 return false;
5714 }
5715 }
5716
5717 if (isSOPK(MI)) {
5718 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5719 if (Desc.isBranch()) {
5720 if (!Op->isMBB()) {
5721 ErrInfo = "invalid branch target for SOPK instruction";
5722 return false;
5723 }
5724 } else {
5725 uint64_t Imm = Op->getImm();
5726 if (sopkIsZext(Opcode)) {
5727 if (!isUInt<16>(Imm)) {
5728 ErrInfo = "invalid immediate for SOPK instruction";
5729 return false;
5730 }
5731 } else {
5732 if (!isInt<16>(Imm)) {
5733 ErrInfo = "invalid immediate for SOPK instruction";
5734 return false;
5735 }
5736 }
5737 }
5738 }
5739
5740 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5741 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5742 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5743 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5744 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5745 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5746
5747 const unsigned StaticNumOps =
5748 Desc.getNumOperands() + Desc.implicit_uses().size();
5749 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5750
5751 // Require additional implicit operands. This allows a fixup done by the
5752 // post RA scheduler where the main implicit operand is killed and
5753 // implicit-defs are added for sub-registers that remain live after this
5754 // instruction.
5755 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5756 ErrInfo = "missing implicit register operands";
5757 return false;
5758 }
5759
5760 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5761 if (IsDst) {
5762 if (!Dst->isUse()) {
5763 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5764 return false;
5765 }
5766
5767 unsigned UseOpIdx;
5768 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5769 UseOpIdx != StaticNumOps + 1) {
5770 ErrInfo = "movrel implicit operands should be tied";
5771 return false;
5772 }
5773 }
5774
5775 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5776 const MachineOperand &ImpUse
5777 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5778 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5779 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5780 ErrInfo = "src0 should be subreg of implicit vector use";
5781 return false;
5782 }
5783 }
5784
5785 // Make sure we aren't losing exec uses in the td files. This mostly requires
5786 // being careful when using let Uses to try to add other use registers.
5787 if (shouldReadExec(MI)) {
5788 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5789 ErrInfo = "VALU instruction does not implicitly read exec mask";
5790 return false;
5791 }
5792 }
5793
5794 if (isSMRD(MI)) {
5795 if (MI.mayStore() &&
5796 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5797 // The register offset form of scalar stores may only use m0 as the
5798 // soffset register.
5799 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5800 if (Soff && Soff->getReg() != AMDGPU::M0) {
5801 ErrInfo = "scalar stores must use m0 as offset register";
5802 return false;
5803 }
5804 }
5805 }
5806
5807 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5808 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5809 if (Offset->getImm() != 0) {
5810 ErrInfo = "subtarget does not support offsets in flat instructions";
5811 return false;
5812 }
5813 }
5814
5815 if (isDS(MI) && !ST.hasGDS()) {
5816 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5817 if (GDSOp && GDSOp->getImm() != 0) {
5818 ErrInfo = "GDS is not supported on this subtarget";
5819 return false;
5820 }
5821 }
5822
5823 if (isImage(MI)) {
5824 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5825 if (DimOp) {
5826 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5827 AMDGPU::OpName::vaddr0);
5828 AMDGPU::OpName RSrcOpName =
5829 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5830 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5831 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5832 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5833 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5834 const AMDGPU::MIMGDimInfo *Dim =
5836
5837 if (!Dim) {
5838 ErrInfo = "dim is out of range";
5839 return false;
5840 }
5841
5842 bool IsA16 = false;
5843 if (ST.hasR128A16()) {
5844 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5845 IsA16 = R128A16->getImm() != 0;
5846 } else if (ST.hasA16()) {
5847 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5848 IsA16 = A16->getImm() != 0;
5849 }
5850
5851 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5852
5853 unsigned AddrWords =
5854 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5855
5856 unsigned VAddrWords;
5857 if (IsNSA) {
5858 VAddrWords = RsrcIdx - VAddr0Idx;
5859 if (ST.hasPartialNSAEncoding() &&
5860 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5861 unsigned LastVAddrIdx = RsrcIdx - 1;
5862 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5863 }
5864 } else {
5865 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5866 if (AddrWords > 12)
5867 AddrWords = 16;
5868 }
5869
5870 if (VAddrWords != AddrWords) {
5871 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5872 << " but got " << VAddrWords << "\n");
5873 ErrInfo = "bad vaddr size";
5874 return false;
5875 }
5876 }
5877 }
5878
5879 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5880 if (DppCt) {
5881 using namespace AMDGPU::DPP;
5882
5883 unsigned DC = DppCt->getImm();
5884 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5885 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5886 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5887 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5888 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5889 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5890 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5891 ErrInfo = "Invalid dpp_ctrl value";
5892 return false;
5893 }
5894 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5895 !ST.hasDPPWavefrontShifts()) {
5896 ErrInfo = "Invalid dpp_ctrl value: "
5897 "wavefront shifts are not supported on GFX10+";
5898 return false;
5899 }
5900 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5901 !ST.hasDPPBroadcasts()) {
5902 ErrInfo = "Invalid dpp_ctrl value: "
5903 "broadcasts are not supported on GFX10+";
5904 return false;
5905 }
5906 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5907 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5908 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5909 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5910 !ST.hasGFX90AInsts()) {
5911 ErrInfo = "Invalid dpp_ctrl value: "
5912 "row_newbroadcast/row_share is not supported before "
5913 "GFX90A/GFX10";
5914 return false;
5915 }
5916 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5917 ErrInfo = "Invalid dpp_ctrl value: "
5918 "row_share and row_xmask are not supported before GFX10";
5919 return false;
5920 }
5921 }
5922
5923 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5925 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5926 ErrInfo = "Invalid dpp_ctrl value: "
5927 "DP ALU dpp only support row_newbcast";
5928 return false;
5929 }
5930 }
5931
5932 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5933 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5934 AMDGPU::OpName DataName =
5935 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5936 const MachineOperand *Data = getNamedOperand(MI, DataName);
5937 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5938 if (Data && !Data->isReg())
5939 Data = nullptr;
5940
5941 if (ST.hasGFX90AInsts()) {
5942 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5943 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5944 ErrInfo = "Invalid register class: "
5945 "vdata and vdst should be both VGPR or AGPR";
5946 return false;
5947 }
5948 if (Data && Data2 &&
5949 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5950 ErrInfo = "Invalid register class: "
5951 "both data operands should be VGPR or AGPR";
5952 return false;
5953 }
5954 } else {
5955 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5956 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5957 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5958 ErrInfo = "Invalid register class: "
5959 "agpr loads and stores not supported on this GPU";
5960 return false;
5961 }
5962 }
5963 }
5964
5965 if (ST.needsAlignedVGPRs()) {
5966 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5968 if (!Op)
5969 return true;
5970 Register Reg = Op->getReg();
5971 if (Reg.isPhysical())
5972 return !(RI.getHWRegIndex(Reg) & 1);
5973 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5974 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5975 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5976 };
5977
5978 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5979 Opcode == AMDGPU::DS_GWS_BARRIER) {
5980
5981 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5982 ErrInfo = "Subtarget requires even aligned vector registers "
5983 "for DS_GWS instructions";
5984 return false;
5985 }
5986 }
5987
5988 if (isMIMG(MI)) {
5989 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5990 ErrInfo = "Subtarget requires even aligned vector registers "
5991 "for vaddr operand of image instructions";
5992 return false;
5993 }
5994 }
5995 }
5996
5997 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5998 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5999 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
6000 ErrInfo = "Invalid register class: "
6001 "v_accvgpr_write with an SGPR is not supported on this GPU";
6002 return false;
6003 }
6004 }
6005
6006 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
6007 const MachineOperand &SrcOp = MI.getOperand(1);
6008 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
6009 ErrInfo = "pseudo expects only physical SGPRs";
6010 return false;
6011 }
6012 }
6013
6014 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
6015 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
6016 if (!ST.hasScaleOffset()) {
6017 ErrInfo = "Subtarget does not support offset scaling";
6018 return false;
6019 }
6020 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
6021 ErrInfo = "Instruction does not support offset scaling";
6022 return false;
6023 }
6024 }
6025 }
6026
6027 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32or64BitOperand for more
6028 // information.
6030 for (unsigned I = 0; I < 3; ++I) {
6032 return false;
6033 }
6034 }
6035
6036 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
6037 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
6038 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
6039 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
6040 &AMDGPU::SReg_64RegClass) ||
6041 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
6042 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
6043 return false;
6044 }
6045 }
6046
6047 return true;
6048}
6049
// Select the VALU replacement opcode for MI.
// S_MOV_B32 is special-cased: it becomes a plain COPY when its operand 1 is a
// register or when its operand 0 is an AGPR, and V_MOV_B32_e32 otherwise.
// Every other opcode is resolved through getVALUOp(unsigned).
6051 if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
6052 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6053 return MI.getOperand(1).isReg() || RI.isAGPR(MRI, MI.getOperand(0).getReg())
6054 ? AMDGPU::COPY
6055 : AMDGPU::V_MOV_B32_e32;
6056 }
6057 return getVALUOp(MI.getOpcode());
6058}
6059
6060// It is more readable to list mapped opcodes on the same line.
6061// clang-format off
6062
/// Map the scalar opcode \p Opc to the vector opcode used to replace it.
///
/// Generic and mode-marker opcodes (REG_SEQUENCE, COPY, PHI, INSERT_SUBREG,
/// WQM, SOFT_WQM, STRICT_WWM, STRICT_WQM) map to themselves. Opcodes with no
/// entry return AMDGPU::INSTRUCTION_LIST_END, as does S_XNOR_B32 when
/// ST.hasDLInsts() is false. Several entries depend on the subtarget:
/// ST.hasAddNoCarryInsts() selects the carry-less add/sub forms and
/// ST.useRealTrue16Insts() selects between the _t16 and _fake16 F16 forms.
6063unsigned SIInstrInfo::getVALUOp(unsigned Opc) const {
6064 switch (Opc) {
6065 default: return AMDGPU::INSTRUCTION_LIST_END;
6066 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
6067 case AMDGPU::COPY: return AMDGPU::COPY;
6068 case AMDGPU::PHI: return AMDGPU::PHI;
6069 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
6070 case AMDGPU::WQM: return AMDGPU::WQM;
6071 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
6072 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
6073 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
6074 case AMDGPU::S_ADD_I32:
6075 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
6076 case AMDGPU::S_ADDC_U32:
6077 return AMDGPU::V_ADDC_U32_e32;
6078 case AMDGPU::S_SUB_I32:
6079 return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
6080 // FIXME: These are not consistently handled, and selected when the carry is
6081 // used.
6082 case AMDGPU::S_ADD_U32:
6083 return AMDGPU::V_ADD_CO_U32_e32;
6084 case AMDGPU::S_SUB_U32:
6085 return AMDGPU::V_SUB_CO_U32_e32;
6086 case AMDGPU::S_ADD_U64_PSEUDO:
6087 return AMDGPU::V_ADD_U64_PSEUDO;
6088 case AMDGPU::S_SUB_U64_PSEUDO:
6089 return AMDGPU::V_SUB_U64_PSEUDO;
6090 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
6091 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
6092 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
6093 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
6094 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
6095 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
6096 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
 // XNOR only has a vector form when the subtarget reports DL instructions.
6097 case AMDGPU::S_XNOR_B32:
6098 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
6099 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
6100 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
6101 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
6102 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
6103 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
6104 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
6105 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
6106 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
6107 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
6108 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
 // Both sign-extension opcodes share V_BFE_I32_e64 with S_BFE_I32.
 // NOTE(review): the bitfield offset/width operands are presumably supplied
 // by the caller; confirm at the use site.
6109 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
6110 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
6111 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
6112 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
6113 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
6114 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
6115 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
 // NOTE(review): the 64-bit NOT maps to the 32-bit vector opcode; presumably
 // the caller splits the operation into halves. Confirm before relying on it.
6116 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
 // Scalar LG ("less or greater") integer compares map to the vector NE forms.
6117 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
6118 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
6119 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
6120 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
6121 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
6122 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
6123 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
6124 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
6125 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
6126 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
6127 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
6128 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
6129 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
6130 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
6131 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
6132 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
6133 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
6134 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
 // Branches on SCC are replaced by the corresponding branches on VCC.
6135 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
6136 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
6137 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
6138 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
6139 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
6140 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
 // From here on, F16 entries pick the _t16 form when real true16
 // instructions are in use and the _fake16 form otherwise.
6141 case AMDGPU::S_CVT_F32_F16:
6142 case AMDGPU::S_CVT_HI_F32_F16:
6143 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
6144 : AMDGPU::V_CVT_F32_F16_fake16_e64;
6145 case AMDGPU::S_CVT_F16_F32:
6146 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
6147 : AMDGPU::V_CVT_F16_F32_fake16_e64;
6148 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
6149 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
6150 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
6151 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
6152 case AMDGPU::S_CEIL_F16:
6153 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
6154 : AMDGPU::V_CEIL_F16_fake16_e64;
6155 case AMDGPU::S_FLOOR_F16:
6156 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6157 : AMDGPU::V_FLOOR_F16_fake16_e64;
6158 case AMDGPU::S_TRUNC_F16:
6159 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6160 : AMDGPU::V_TRUNC_F16_fake16_e64;
6161 case AMDGPU::S_RNDNE_F16:
6162 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6163 : AMDGPU::V_RNDNE_F16_fake16_e64;
6164 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6165 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6166 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6167 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6168 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6169 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6170 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6171 case AMDGPU::S_ADD_F16:
6172 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6173 : AMDGPU::V_ADD_F16_fake16_e64;
6174 case AMDGPU::S_SUB_F16:
6175 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6176 : AMDGPU::V_SUB_F16_fake16_e64;
6177 case AMDGPU::S_MIN_F16:
6178 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6179 : AMDGPU::V_MIN_F16_fake16_e64;
6180 case AMDGPU::S_MAX_F16:
6181 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6182 : AMDGPU::V_MAX_F16_fake16_e64;
6183 case AMDGPU::S_MINIMUM_F16:
6184 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6185 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6186 case AMDGPU::S_MAXIMUM_F16:
6187 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6188 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6189 case AMDGPU::S_MUL_F16:
6190 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6191 : AMDGPU::V_MUL_F16_fake16_e64;
6192 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6193 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6194 case AMDGPU::S_FMAC_F16:
6195 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6196 : AMDGPU::V_FMAC_F16_fake16_e64;
6197 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6198 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6199 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6200 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6201 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6202 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6203 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6204 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6205 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6206 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6207 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6208 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6209 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6210 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6211 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6212 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6213 case AMDGPU::S_CMP_LT_F16:
6214 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6215 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6216 case AMDGPU::S_CMP_EQ_F16:
6217 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6218 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6219 case AMDGPU::S_CMP_LE_F16:
6220 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6221 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6222 case AMDGPU::S_CMP_GT_F16:
6223 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6224 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6225 case AMDGPU::S_CMP_LG_F16:
6226 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6227 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6228 case AMDGPU::S_CMP_GE_F16:
6229 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6230 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6231 case AMDGPU::S_CMP_O_F16:
6232 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6233 : AMDGPU::V_CMP_O_F16_fake16_e64;
6234 case AMDGPU::S_CMP_U_F16:
6235 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6236 : AMDGPU::V_CMP_U_F16_fake16_e64;
6237 case AMDGPU::S_CMP_NGE_F16:
6238 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6239 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6240 case AMDGPU::S_CMP_NLG_F16:
6241 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6242 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6243 case AMDGPU::S_CMP_NGT_F16:
6244 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6245 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6246 case AMDGPU::S_CMP_NLE_F16:
6247 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6248 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6249 case AMDGPU::S_CMP_NEQ_F16:
6250 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6251 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6252 case AMDGPU::S_CMP_NLT_F16:
6253 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6254 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
 // The V_S_* opcodes map to the vector opcode of the same operation.
6255 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6256 case AMDGPU::V_S_EXP_F16_e64:
6257 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6258 : AMDGPU::V_EXP_F16_fake16_e64;
6259 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6260 case AMDGPU::V_S_LOG_F16_e64:
6261 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6262 : AMDGPU::V_LOG_F16_fake16_e64;
6263 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6264 case AMDGPU::V_S_RCP_F16_e64:
6265 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6266 : AMDGPU::V_RCP_F16_fake16_e64;
6267 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6268 case AMDGPU::V_S_RSQ_F16_e64:
6269 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6270 : AMDGPU::V_RSQ_F16_fake16_e64;
6271 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6272 case AMDGPU::V_S_SQRT_F16_e64:
6273 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6274 : AMDGPU::V_SQRT_F16_fake16_e64;
6275 }
6277 "Unexpected scalar opcode without corresponding vector one!");
6278}
6279
6280// clang-format on
6281
6285 const DebugLoc &DL, Register Reg,
6286 bool IsSCCLive,
6287 SlotIndexes *Indexes) const {
 // Emits, before MBBI in MBB, code that saves EXEC into Reg and then turns on
 // all bits of EXEC. IsSCCLive selects a two-move sequence that leaves SCC
 // untouched; otherwise a single or-saveexec instruction is used. When
 // Indexes is non-null every new instruction is registered with it.
6288 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6289 const SIInstrInfo *TII = ST.getInstrInfo();
6291 if (IsSCCLive) {
6292 // Insert two move instructions, one to save the original value of EXEC and
6293 // the other to turn on all bits in EXEC. This is required as we can't use
6294 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
6295 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6297 auto FlipExecMI =
6298 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6299 if (Indexes) {
6300 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6301 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6302 }
6303 } else {
 // SCC is not live, so the combined save-and-or form can be used; the -1
 // immediate sets every EXEC bit.
6304 auto SaveExec =
6305 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6306 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6307 if (Indexes)
6308 Indexes->insertMachineInstrInMaps(*SaveExec);
6309 }
6310}
6311
6314 const DebugLoc &DL, Register Reg,
6315 SlotIndexes *Indexes) const {
 // Emits, before MBBI in MBB, a move of Reg into the EXEC register. Reg is
 // flagged as killed by this use. When Indexes is non-null the new
 // instruction is registered with it.
6317 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6318 .addReg(Reg, RegState::Kill);
6319 if (Indexes)
6320 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6321}
6322
6326 "Not a whole wave func");
 // Scan the first basic block of MF and return a pointer to the first
 // whole-wave-function setup instruction (the selected pseudo or its generic
 // G_ form). Not finding one is treated as unreachable.
6327 MachineBasicBlock &MBB = *MF.begin();
6328 for (MachineInstr &MI : MBB)
6329 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6330 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6331 return &MI;
6332
 // NOTE(review): the message below spells the opcode as
 // SI_SETUP_WHOLE_WAVE_FUNC, while the opcode checked above is
 // SI_WHOLE_WAVE_FUNC_SETUP.
6333 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6334}
6335
6337 unsigned OpNo) const {
 // Return the register class of operand OpNo of MI.
 // If the instruction descriptor cannot answer (variadic instruction, OpNo
 // beyond the described operands, or a RegClass of -1), the class is derived
 // from the register currently in the operand: the MachineRegisterInfo class
 // for a virtual register, the physical base class otherwise. That path calls
 // getReg() unconditionally, so the operand must be a register there.
 // Otherwise the descriptor's class ID is used; a negative ID yields nullptr.
6338 const MCInstrDesc &Desc = get(MI.getOpcode());
6339 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6340 Desc.operands()[OpNo].RegClass == -1) {
6341 Register Reg = MI.getOperand(OpNo).getReg();
6342
6343 if (Reg.isVirtual()) {
6344 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6345 return MRI.getRegClass(Reg);
6346 }
6347 return RI.getPhysRegBaseClass(Reg);
6348 }
6349
6350 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6351 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6352}
6353
// Legalize operand OpIdx of MI by materializing its current value into a fresh
// virtual register and rewriting the operand to read that register.
6356 MachineBasicBlock *MBB = MI.getParent();
6357 MachineOperand &MO = MI.getOperand(OpIdx);
6358 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6359 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6360 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6361 unsigned Size = RI.getRegSizeInBits(*RC);
 // Default to a vector move sized after the operand's register class.
6362 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6363 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6364 : AMDGPU::V_MOV_B32_e32;
 // A register source is simply copied; a non-register source destined for an
 // SGPR-class operand uses the scalar move of matching width.
6365 if (MO.isReg())
6366 Opcode = AMDGPU::COPY;
6367 else if (RI.isSGPRClass(RC))
6368 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6369
 // The new register always uses the VGPR class equivalent to RC.
6370 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6371 Register Reg = MRI.createVirtualRegister(VRC);
6372 DebugLoc DL = MBB->findDebugLoc(I);
 // Emit the move at I with the original operand as its source, then retarget
 // the operand to the new register.
6373 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6374 MO.ChangeToRegister(Reg, false);
6375}
6376
6379 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6380 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
 // Return a register holding sub-register SubIdx of SuperReg.
 // A non-virtual SuperReg is resolved directly through the register info and
 // no instruction is emitted. For a virtual SuperReg, a new virtual register
 // of class SubRC is created and a COPY from the sub-register is inserted
 // before MI.
6381 if (!SuperReg.getReg().isVirtual())
6382 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6383
6384 MachineBasicBlock *MBB = MI->getParent();
6385 const DebugLoc &DL = MI->getDebugLoc();
6386 Register SubReg = MRI.createVirtualRegister(SubRC);
6387
 // SuperReg may itself carry a sub-register index; compose it with SubIdx so
 // the copy reads the right lanes.
6388 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6389 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6390 .addReg(SuperReg.getReg(), {}, NewSubIdx);
6391 return SubReg;
6392}
6393
6396 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6397 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
 // Return an operand for the SubIdx part of Op.
 // For an immediate Op the part is computed directly: sub0 yields the low 32
 // bits and sub1 the value shifted right by 32, each converted to int32_t.
 // Any other SubIdx on an immediate is unreachable. For a non-immediate Op the
 // part is extracted with buildExtractSubReg and returned as a register
 // operand.
6398 if (Op.isImm()) {
6399 if (SubIdx == AMDGPU::sub0)
6400 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6401 if (SubIdx == AMDGPU::sub1)
6402 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6403
6404 llvm_unreachable("Unhandled register index for immediate");
6405 }
6406
6407 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6408 SubIdx, SubRC);
6409 return MachineOperand::CreateReg(SubReg, false);
6410}
6411
6412// Change the order of operands from (0, 1, 2) to (0, 2, 1)
// The instruction must have exactly three explicit operands (asserted).
// Operand 1 is copied by value before it is removed, then appended, so the
// old operand 2 moves into slot 1 and the old operand 1 ends up last.
6413void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6414 assert(Inst.getNumExplicitOperands() == 3);
6415 MachineOperand Op1 = Inst.getOperand(1);
6416 Inst.removeOperand(1);
6417 Inst.addOperand(Op1);
6418}
6419
6421 const MCOperandInfo &OpInfo,
6422 const MachineOperand &MO) const {
 // Return true if register operand MO is compatible with the register class
 // that OpInfo requires. Non-register operands are never legal here.
6423 if (!MO.isReg())
6424 return false;
6425
6426 Register Reg = MO.getReg();
6427
 // A physical register must be a member of the required class.
6428 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6429 if (Reg.isPhysical())
6430 return DRC->contains(Reg);
6431
6432 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6433
 // With a sub-register index, the check is whether some super-register class
 // (starting from the largest legal super class of RC) has a SubReg part
 // that lands in the required class.
6434 if (MO.getSubReg()) {
6435 const MachineFunction *MF = MO.getParent()->getMF();
6436 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6437 if (!SuperRC)
6438 return false;
6439 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6440 }
6441
 // Otherwise the virtual register's class only needs to share a subclass
 // with the required class.
6442 return RI.getCommonSubClass(DRC, RC) != nullptr;
6443}
6444
6446 const MachineOperand &MO) const {
 // Return true if register operand MO may be placed in operand slot OpIdx of
 // MI. On top of the register-class check this applies subtarget rules for
 // packed math SGPR sources, AGPR usage, v_accvgpr_write, reads of
 // flat_scratch_base_hi, and SGPR src1 of DPP instructions.
6447 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6448 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6449 unsigned Opc = MI.getOpcode();
6450
6451 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32or64BitOperand for more
6452 // information.
6453 if (AMDGPU::isPackedFP32or64BitInst(MI.getOpcode()) &&
6454 AMDGPU::isGFX12Plus(ST) && MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6455 constexpr AMDGPU::OpName OpNames[] = {
6456 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6457
 // Find which of src0..src2 (if any) OpIdx refers to and apply the
 // packed-math rule to that source.
6458 for (auto [I, OpName] : enumerate(OpNames)) {
6459 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6460 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6462 return false;
6463 }
6464 }
6465
6466 if (!isLegalRegOperand(MRI, OpInfo, MO))
6467 return false;
6468
6469 // Check accumulator GPR (AGPR) operands.
6470 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6471 if (IsAGPR && !ST.hasMAIInsts())
6472 return false;
 // AGPRs in loads, stores, DS and MIMG instructions are rejected unless the
 // subtarget has GFX90A instructions and reserved registers are frozen.
6473 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6474 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6475 return false;
6476 // Atomics should have both vdst and vdata either vgpr or agpr.
6477 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6478 const int DataIdx = AMDGPU::getNamedOperandIdx(
6479 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6480 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6481 MI.getOperand(DataIdx).isReg() &&
6482 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6483 return false;
6484 if ((int)OpIdx == DataIdx) {
6485 if (VDstIdx != -1 &&
6486 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6487 return false;
6488 // DS instructions with 2 src operands also must have tied RC.
6489 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6490 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6491 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6492 return false;
6493 }
6494
6495 // Check V_ACCVGPR_WRITE_B32_e64
 // Without GFX90A instructions its src0 may not be an SGPR.
6496 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6497 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6498 RI.isSGPRReg(MRI, MO.getReg()))
6499 return false;
6500
 // On subtargets with the flat-scratch-hi hazard, a SALU instruction with a
 // 64-bit sdst, or one of the 64-bit S_BITCMP opcodes, may not read
 // SRC_FLAT_SCRATCH_BASE_HI.
6501 if (ST.hasFlatScratchHiInB64InstHazard() &&
6502 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6503 if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
6504 if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
6505 64)
6506 return false;
6507 }
6508 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6509 return false;
6510 }
 // DPP instructions accept an SGPR as src1 only if the subtarget supports it.
6511 if (!ST.hasDPPSrc1SGPR() && isDPP(MI) && RI.isSGPRReg(MRI, MO.getReg()) &&
6512 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1))
6513 return false;
6514
6515 return true;
6516}
6517
6519 const MCOperandInfo &OpInfo,
6520 const MachineOperand &MO) const {
 // Register operands are checked against the class required by OpInfo.
 // Every other accepted operand kind (immediate, target index, frame index,
 // global) is treated like an immediate and is always legal.
6521 if (MO.isReg())
6522 return isLegalRegOperand(MRI, OpInfo, MO);
6523
6524 // Handle non-register types that are treated like immediates.
6525 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6526 return true;
6527}
6528
6530 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6531 const MachineOperand *MO) const {
 // Return true if source SrcN (0..2) of MI is acceptable as a packed-math
 // source on GFX12+. MO is the candidate operand; when it is null the operand
 // currently in MI is checked instead.
 //
 // Only SGPR sources are restricted: they are legal only when neither
 // OP_SEL_0 nor OP_SEL_1 is set in the matching srcN_modifiers operand.
 // A missing source operand, a non-register operand and a non-SGPR register
 // are all legal.
 //
 // OpNames holds the three source names followed by the three matching
 // modifier names, so the modifiers of source N live at index NumOps + N.
6532 constexpr unsigned NumOps = 3;
6533 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6534 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6535 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6536 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6537
6538 assert(SrcN < NumOps);
6539
6540 if (!MO) {
6541 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
 // The instruction has no such source, so there is nothing to restrict.
6542 if (SrcIdx == -1)
6543 return true;
6544 MO = &MI.getOperand(SrcIdx);
6545 }
6546
6547 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6548 return true;
6549
6550 int ModsIdx =
6551 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
 // Without a modifiers operand neither op_sel bit can be set, so the SGPR
 // source is legal. This matches the final check below and the handling of a
 // missing source operand above.
6552 if (ModsIdx == -1)
6553 return true;
6554
6555 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6556 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6557 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6558
6559 return !OpSel && !OpSelHi;
6560}
6561
6563 const MachineOperand *MO) const {
 // Return true if MO may legally occupy operand slot OpIdx of MI. When MO is
 // null, the operand currently stored at OpIdx is checked. The check covers
 // constant-bus and literal limits for VALU instructions, the single-literal
 // rule for SALU instructions, register-class legality for registers, and
 // 64-bit literal encodability for immediates.
6564 const MachineFunction &MF = *MI.getMF();
6565 const MachineRegisterInfo &MRI = MF.getRegInfo();
6566 const MCInstrDesc &InstDesc = MI.getDesc();
6567 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6568 int64_t RegClass = getOpRegClassID(OpInfo);
 // DefinedRC is null when the operand slot has no register class.
6569 const TargetRegisterClass *DefinedRC =
6570 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6571 if (!MO)
6572 MO = &MI.getOperand(OpIdx);
6573
6574 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6575
 // VALU: the candidate itself uses the constant bus, so the remaining
 // operands must fit in what is left of the constant-bus and literal budgets.
6576 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6577 const MachineOperand *UsedLiteral = nullptr;
6578
6579 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
 // VOP3 encodings accept a literal only if the subtarget supports it.
6580 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6581
6582 // TODO: Be more permissive with frame indexes.
6583 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6584 if (!LiteralLimit--)
6585 return false;
6586
6587 UsedLiteral = MO;
6588 }
6589
 // Track distinct (register, sub-register) pairs so that a repeated SGPR is
 // only counted once against the constant-bus limit.
6591 if (MO->isReg())
6592 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6593
6594 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6595 if (i == OpIdx)
6596 continue;
6597 const MachineOperand &Op = MI.getOperand(i);
6598 if (Op.isReg()) {
6599 if (Op.isUse()) {
6600 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
 // The pre-decrement test leaves one constant-bus slot for the
 // candidate operand itself.
6601 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6602 if (--ConstantBusLimit <= 0)
6603 return false;
6604 }
6605 }
6606 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6607 !isInlineConstant(Op, InstDesc.operands()[i])) {
6608 // The same literal may be used multiple times.
6609 if (!UsedLiteral)
6610 UsedLiteral = &Op;
6611 else if (UsedLiteral->isIdenticalTo(Op))
6612 continue;
6613
6614 if (!LiteralLimit--)
6615 return false;
6616 if (--ConstantBusLimit <= 0)
6617 return false;
6618 }
6619 }
6620 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6621 // There can be at most one literal operand, but it can be repeated.
6622 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6623 if (i == OpIdx)
6624 continue;
6625 const MachineOperand &Op = MI.getOperand(i);
6626 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6627 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6628 !Op.isIdenticalTo(*MO))
6629 return false;
6630
6631 // Do not fold a non-inlineable and non-register operand into an
6632 // instruction that already has a frame index. The frame index handling
6633 // code could not handle well when a frame index co-exists with another
6634 // non-register operand, unless that operand is an inlineable immediate.
6635 if (Op.isFI())
6636 return false;
6637 }
6638 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6639 isF16PseudoScalarTrans(MI.getOpcode())) {
 // On such subtargets F16 pseudo-scalar transcendental instructions cannot
 // take inline constants.
6640 return false;
6641 }
6642
 // Register candidates: a slot without a register class accepts a register
 // only when its operand type is OPERAND_UNKNOWN; otherwise defer to the
 // register legality check.
6643 if (MO->isReg()) {
6644 if (!DefinedRC)
6645 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6646 return isLegalRegOperand(MI, OpIdx, *MO);
6647 }
6648
 // Immediate candidates for 64-bit operand types that are not inline
 // constants must be encodable as a literal.
6649 if (MO->isImm()) {
6650 uint64_t Imm = MO->getImm();
6651 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 ||
6652 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP64;
6653 bool Is64BitOp = Is64BitFPOp ||
6654 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6655 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6656 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32 ||
6657 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT64;
6658 if (Is64BitOp &&
6659 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
 // Reject values that fit no 32-bit literal unless the subtarget has
 // 64-bit literals and the instruction's base size is 4 bytes.
6660 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6661 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6662 return false;
6663
6664 // FIXME: We can use sign extended 64-bit literals, but only for signed
6665 // operands. At the moment we do not know if an operand is signed.
6666 // Such operand will be encoded as its low 32 bits and then either
6667 // correctly sign extended or incorrectly zero extended by HW.
6668 // If 64-bit literals are supported and the literal will be encoded
6669 // as full 64 bit we still can use it.
6670 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6671 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6672 return false;
6673 }
6674 }
6675
6676 // Handle non-register types that are treated like immediates.
6677 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6678
6679 if (!DefinedRC) {
6680 // This operand expects an immediate.
6681 return true;
6682 }
6683
6684 return isImmOperandLegal(MI, OpIdx, *MO);
6685}
6686
// NOTE(review): the signature line of this function (listing line 6687) is
// missing from this extract; only the body is visible.
//
// Visible behavior: returns false unless the subtarget reports GFX940 or
// GFX950 instructions and MI is a VALU instruction. Otherwise returns true for
// TRANS, DOT and MFMA instructions and for the opcodes enumerated in the
// switch below, and false for everything else.
//
// Despite the "Only" suffix, both flags simply hold the result of the
// corresponding ST.has*Insts() query.
6688 bool IsGFX950Only = ST.hasGFX950Insts();
6689 bool IsGFX940Only = ST.hasGFX940Insts();
6690
// Neither feature is present: nothing is reported by this query.
6691 if (!IsGFX950Only && !IsGFX940Only)
6692 return false;
6693
// Only VALU instructions are considered at all.
6694 if (!isVALU(MI))
6695 return false;
6696
6697 // V_COS, V_EXP, V_RCP, etc.
6698 if (isTRANS(MI))
6699 return true;
6700
6701 // DOT2, DOT2C, DOT4, etc.
6702 if (isDOT(MI))
6703 return true;
6704
6705 // MFMA, SMFMA
6706 if (isMFMA(MI))
6707 return true;
6708
// Remaining cases are identified by explicit opcode: FP8/BF8 packed converts,
// the QSAD/MQSAD family, and the V_PK_* packed operations listed here.
6709 unsigned Opcode = MI.getOpcode();
6710 switch (Opcode) {
6711 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6712 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6713 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6714 case AMDGPU::V_MQSAD_U32_U8_e64:
6715 case AMDGPU::V_PK_ADD_F16:
6716 case AMDGPU::V_PK_ADD_F32:
6717 case AMDGPU::V_PK_ADD_I16:
6718 case AMDGPU::V_PK_ADD_U16:
6719 case AMDGPU::V_PK_ASHRREV_I16:
6720 case AMDGPU::V_PK_FMA_F16:
6721 case AMDGPU::V_PK_FMA_F32:
6722 case AMDGPU::V_PK_FMAC_F16_e32:
6723 case AMDGPU::V_PK_FMAC_F16_e64:
6724 case AMDGPU::V_PK_LSHLREV_B16:
6725 case AMDGPU::V_PK_LSHRREV_B16:
6726 case AMDGPU::V_PK_MAD_I16:
6727 case AMDGPU::V_PK_MAD_U16:
6728 case AMDGPU::V_PK_MAX_F16:
6729 case AMDGPU::V_PK_MAX_I16:
6730 case AMDGPU::V_PK_MAX_U16:
6731 case AMDGPU::V_PK_MIN_F16:
6732 case AMDGPU::V_PK_MIN_I16:
6733 case AMDGPU::V_PK_MIN_U16:
6734 case AMDGPU::V_PK_MOV_B32:
6735 case AMDGPU::V_PK_MUL_F16:
6736 case AMDGPU::V_PK_MUL_F32:
6737 case AMDGPU::V_PK_MUL_LO_U16:
6738 case AMDGPU::V_PK_SUB_I16:
6739 case AMDGPU::V_PK_SUB_U16:
6740 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6741 return true;
6742 default:
6743 return false;
6744 }
6745}
6746
// NOTE(review): the first line of this signature (listing line 6747) is
// missing from this extract; MRI, used throughout the body, is presumably
// declared there.
//
// Visible behavior: rewrites the src0/src1 (and, for V_FMAC e32, src2)
// operands of MI in place so that they satisfy the operand constraints checked
// below. The strategies, in order, are: read a VGPR into an SGPR with
// V_READFIRSTLANE_B32 (writelane/readlane special cases), swap src0 and src1
// under the commuted opcode, or fall back to legalizeOpWithMove.
6748 MachineInstr &MI) const {
6749 unsigned Opc = MI.getOpcode();
6750 const MCInstrDesc &InstrDesc = get(Opc);
6751
6752 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6753 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6754
6755 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6756 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6757
6758 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6759 // we need to only have one constant bus use before GFX10.
6760 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6761 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6762 RI.isSGPRReg(MRI, Src0.getReg()))
6763 legalizeOpWithMove(MI, Src0Idx);
6764
6765 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6766 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6767 // src0/src1 with V_READFIRSTLANE.
6768 if (Opc == AMDGPU::V_WRITELANE_B32) {
6769 const DebugLoc &DL = MI.getDebugLoc();
6770 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6771 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6772 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6773 .add(Src0);
6774 Src0.ChangeToRegister(Reg, false);
6775 }
6776 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6777 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
// NOTE(review): this DL shadows the identical DL declared at the top of the
// enclosing if-block; the redeclaration is redundant.
6778 const DebugLoc &DL = MI.getDebugLoc();
6779 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6780 .add(Src1);
6781 Src1.ChangeToRegister(Reg, false);
6782 }
// Writelane is fully handled here; none of the generic logic below runs.
6783 return;
6784 }
6785
6786 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6787 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6788 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6789 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6790 legalizeOpWithMove(MI, Src2Idx);
6791 }
6792
6793 // VOP2 src0 instructions support all operand types, so we don't need to check
6794 // their legality. If src1 is already legal, we don't need to do anything.
6795 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6796 return;
6797
6798 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6799 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6800 // select is uniform.
6801 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6802 RI.isVGPR(MRI, Src1.getReg())) {
6803 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6804 const DebugLoc &DL = MI.getDebugLoc();
6805 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6806 .add(Src1);
6807 Src1.ChangeToRegister(Reg, false);
6808 return;
6809 }
6810
6811 // We do not use commuteInstruction here because it is too aggressive and will
6812 // commute if it is possible. We only want to commute here if it improves
6813 // legality. This can be called a fairly large number of times so don't waste
6814 // compile time pointlessly swapping and checking legality again.
6815 if (HasImplicitSGPR || !MI.isCommutable()) {
6816 legalizeOpWithMove(MI, Src1Idx);
6817 return;
6818 }
6819
6820 // If src0 can be used as src1, commuting will make the operands legal.
6821 // Otherwise we have to give up and insert a move.
6822 //
6823 // TODO: Other immediate-like operand kinds could be commuted if there was a
6824 // MachineOperand::ChangeTo* for them.
6825 if ((!Src1.isImm() && !Src1.isReg()) ||
6826 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6827 legalizeOpWithMove(MI, Src1Idx);
6828 return;
6829 }
6830
// A result of -1 means no commuted opcode is available, so fall back to a move.
6831 int CommutedOpc = commuteOpcode(MI);
6832 if (CommutedOpc == -1) {
6833 legalizeOpWithMove(MI, Src1Idx);
6834 return;
6835 }
6836
6837 MI.setDesc(get(CommutedOpc));
6838
// Manual operand swap: capture src0's register, subregister and kill flag
// before src0 is overwritten with src1's contents.
6839 Register Src0Reg = Src0.getReg();
6840 unsigned Src0SubReg = Src0.getSubReg();
6841 bool Src0Kill = Src0.isKill();
6842
6843 if (Src1.isImm())
6844 Src0.ChangeToImmediate(Src1.getImm());
6845 else if (Src1.isReg()) {
6846 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6847 Src0.setSubReg(Src1.getSubReg());
6848 } else
6849 llvm_unreachable("Should only have register or immediate operands");
6850
// Complete the swap by writing the saved src0 state into src1.
6851 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6852 Src1.setSubReg(Src0SubReg);
// NOTE(review): listing line 6853 is missing from this extract; a statement
// may follow the swap before the function ends.
6854}
6855
6856// Legalize VOP3 operands. All operand types are supported for any operand
6857// but only one literal constant and only starting from GFX10.
//
// NOTE(review): the first line of this signature (listing line 6858) is
// missing from this extract; MRI, used throughout the body, is presumably
// declared there.
//
// Visible behavior: walks src0/src1/src2 while tracking two budgets, the
// constant bus limit and the literal limit. Operands that cannot be
// accommodated within those budgets are rewritten via legalizeOpWithMove.
6859 MachineInstr &MI) const {
6860 unsigned Opc = MI.getOpcode();
6861
// Operand indices of src0, src1 and src2. The main loop below stops at the
// first index that is -1.
6862 int VOP3Idx[3] = {
6863 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6864 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6865 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6866 };
6867
6868 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6869 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6870 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6871 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6872 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6873 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6874 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6875 // src1 and src2 must be scalar
6876 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6877 const DebugLoc &DL = MI.getDebugLoc();
6878 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6879 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6880 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6881 .add(Src1);
6882 Src1.ChangeToRegister(Reg, false);
6883 }
// src2 is only fixed up when the opcode actually has one.
6884 if (VOP3Idx[2] != -1) {
6885 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6886 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6887 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6888 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6889 .add(Src2);
6890 Src2.ChangeToRegister(Reg, false);
6891 }
6892 }
6893 }
6894
6895 // Find the one SGPR operand we are allowed to use.
6896 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6897 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6898 SmallDenseSet<unsigned> SGPRsUsed;
// If findUsedSGPR returns a register, it is charged against the constant bus
// budget up front so later occurrences of it are treated as already paid for.
6899 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6900 if (SGPRReg) {
6901 SGPRsUsed.insert(SGPRReg);
6902 --ConstantBusLimit;
6903 }
6904
6905 for (int Idx : VOP3Idx) {
6906 if (Idx == -1)
6907 break;
6908 MachineOperand &MO = MI.getOperand(Idx);
6909
6910 if (!MO.isReg()) {
// Inline constants consume neither budget.
6911 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6912 continue;
6913
// A non-inline immediate is kept only when both budgets still have room;
// it then consumes one unit of each.
6914 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6915 --LiteralLimit;
6916 --ConstantBusLimit;
6917 continue;
6918 }
6919
// Out of budget: both counters are still decremented (and may go negative)
// before the operand is rewritten through legalizeOpWithMove.
6920 --LiteralLimit;
6921 --ConstantBusLimit;
6922 legalizeOpWithMove(MI, Idx);
6923 continue;
6924 }
6925
6926 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6927 continue; // VGPRs are legal
6928
6929 // We can use one SGPR in each VOP3 instruction prior to GFX10
6930 // and two starting from GFX10.
// An SGPR that was already counted does not consume the budget again.
6931 if (SGPRsUsed.count(MO.getReg()))
6932 continue;
6933 if (ConstantBusLimit > 0) {
6934 SGPRsUsed.insert(MO.getReg());
6935 --ConstantBusLimit;
6936 continue;
6937 }
6938
6939 // If we make it this far, then the operand is not legal and we must
6940 // legalize it.
6941 legalizeOpWithMove(MI, Idx);
6942 }
6943
6944 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6945 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6946 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6947 legalizeOpWithMove(MI, VOP3Idx[2]);
6948
6949 // Fix the register class of packed FP32 instructions on gfx12+. See
6950 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32or64BitOperand for more
6951 // information.
// NOTE(review): listing lines 6952 and 6954 are missing from this extract.
// Judging by the unmatched closing braces, they presumably hold the enclosing
// condition and the per-operand check that guards the call below.
6953 for (unsigned I = 0; I < 3; ++I) {
6955 legalizeOpWithMove(MI, VOP3Idx[I]);
6956 }
6957 }
6958}
6959
// NOTE(review): the leading signature lines (listing lines 6960-6961) are
// missing from this extract; SrcReg, UseMI and MRI, used in the body, are
// presumably declared there.
//
// Visible behavior: inserts instructions before UseMI that read SrcReg into a
// freshly created virtual register of the equivalent SGPR class (narrowed to
// the common subclass with DstRC when DstRC is non-null) and returns that new
// register. A 32-bit source takes a single V_READFIRSTLANE_B32; wider sources
// are read one 32-bit channel at a time and reassembled with REG_SEQUENCE.
6962 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6963 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6964 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6965 if (DstRC)
6966 SRC = RI.getCommonSubClass(SRC, DstRC);
6967
6968 Register DstReg = MRI.createVirtualRegister(SRC);
// Number of 32-bit pieces in the source register.
6969 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6970
// When the source class has AGPRs, first copy it into the equivalent VGPR
// class and read lanes from that copy instead.
6971 if (RI.hasAGPRs(VRC)) {
6972 VRC = RI.getEquivalentVGPRClass(VRC);
6973 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6974 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6975 get(TargetOpcode::COPY), NewSrcReg)
6976 .addReg(SrcReg);
6977 SrcReg = NewSrcReg;
6978 }
6979
// Single 32-bit piece: one readfirstlane straight into DstReg.
6980 if (SubRegs == 1) {
6981 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6982 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6983 .addReg(SrcReg);
6984 return DstReg;
6985 }
6986
// NOTE(review): listing line 6987 is missing from this extract; it presumably
// declares the SRegs container filled by the loop below.
// One readfirstlane per 32-bit channel, each into its own SGPR_32 register.
6988 for (unsigned i = 0; i < SubRegs; ++i) {
6989 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6990 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6991 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6992 .addReg(SrcReg, {}, RI.getSubRegFromChannel(i));
6993 SRegs.push_back(SGPR);
6994 }
6995
// NOTE(review): listing line 6996 is missing from this extract; it presumably
// begins the declaration of MIB that the following BuildMI call initializes.
// Reassemble the pieces: each operand pair is (register, subregister index).
6997 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6998 get(AMDGPU::REG_SEQUENCE), DstReg);
6999 for (unsigned i = 0; i < SubRegs; ++i) {
7000 MIB.addReg(SRegs[i]);
7001 MIB.addImm(RI.getSubRegFromChannel(i));
7002 }
7003 return DstReg;
7004}
7005
// NOTE(review): the first line of this signature (listing line 7006) is
// missing from this extract; MRI, used in the body, is presumably declared
// there.
//
// Visible behavior: for the named operands sbase and soffset of MI, when the
// operand exists and its register is not an SGPR, replaces the register with
// the result of readlaneVGPRToSGPR.
7007 MachineInstr &MI) const {
7008
7009 // If the pointer is stored in VGPRs, then we need to move them to
7010 // SGPRs using v_readfirstlane. This is safe because we only select
7011 // loads with uniform pointers to SMRD instruction so we know the
7012 // pointer value is uniform.
7013 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
7014 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
7015 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
7016 SBase->setReg(SGPR);
7017 }
// The offset operand gets the same treatment as the base.
7018 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
7019 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
7020 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
7021 SOff->setReg(SGPR);
7022 }
7023}
7024
// NOTE(review): the signature line of this function (listing line 7025) is
// missing from this extract; Inst, used throughout the body, is presumably its
// parameter.
//
// Visible behavior: tries to rewrite Inst in place from a form that takes its
// address in the saddr operand to the opcode returned by the lookups below,
// which takes it in vaddr. Returns true when Inst was rewritten and false when
// any precondition fails; on the false paths Inst is left untouched, because
// every early return precedes the setDesc call.
7026 unsigned Opc = Inst.getOpcode();
7027 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
7028 if (OldSAddrIdx < 0)
7029 return false;
7030
7031 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
7032
7033 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
// NOTE(review): listing line 7035, the statement guarded by the first check
// below, is missing from this extract; presumably it retries the lookup with a
// different mapping before the second check gives up.
7034 if (NewOpc < 0)
7036 if (NewOpc < 0)
7037 return false;
7038
// Nothing to do when saddr already holds an SGPR.
7039 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
7040 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
7041 if (RI.isSGPRReg(MRI, SAddr.getReg()))
7042 return false;
7043
7044 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
7045 if (NewVAddrIdx < 0)
7046 return false;
7047
7048 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
7049
7050 // Check vaddr, it shall be zero or absent.
// "Zero" is established by requiring the unique def of vaddr to be a
// move-immediate whose operand 1 is the immediate 0.
7051 MachineInstr *VAddrDef = nullptr;
7052 if (OldVAddrIdx >= 0) {
7053 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
7054 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
7055 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
7056 !VAddrDef->getOperand(1).isImm() ||
7057 VAddrDef->getOperand(1).getImm() != 0)
7058 return false;
7059 }
7060
7061 const MCInstrDesc &NewDesc = get(NewOpc);
7062 Inst.setDesc(NewDesc);
7063
7064 // Callers expect iterator to be valid after this call, so modify the
7065 // instruction in place.
7066 if (OldVAddrIdx == NewVAddrIdx) {
7067 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
7068 // Clear use list from the old vaddr holding a zero register.
7069 MRI.removeRegOperandFromUseList(&NewVAddr);
7070 MRI.moveOperands(&NewVAddr, &SAddr, 1);
7071 Inst.removeOperand(OldSAddrIdx);
7072 // Update the use list with the pointer we have just moved from vaddr to
7073 // saddr position. Otherwise new vaddr will be missing from the use list.
7074 MRI.removeRegOperandFromUseList(&NewVAddr);
7075 MRI.addRegOperandToUseList(&NewVAddr);
7076 } else {
// Here the old saddr slot is the new vaddr slot, so the pointer is already in
// place and only the old (zero) vaddr operand has to be dropped.
7077 assert(OldSAddrIdx == NewVAddrIdx);
7078
7079 if (OldVAddrIdx >= 0) {
7080 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
7081 AMDGPU::OpName::vdst_in);
7082
7083 // removeOperand doesn't try to fixup tied operand indexes as it goes, so
7084 // it asserts. Untie the operands for now and retie them afterwards.
7085 if (NewVDstIn != -1) {
7086 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
7087 Inst.untieRegOperand(OldVDstIn);
7088 }
7089
7090 Inst.removeOperand(OldVAddrIdx);
7091
7092 if (NewVDstIn != -1) {
7093 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
7094 Inst.tieOperands(NewVDst, NewVDstIn);
7095 }
7096 }
7097 }
7098
// Erase the zero-materializing instruction once it has no remaining non-debug
// uses.
7099 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
7100 VAddrDef->eraseFromParent();
7101
7102 return true;
7103}
7104
7105// FIXME: Remove this when SelectionDAG is obsoleted.
//
// NOTE(review): the first line of this signature (listing line 7106) is
// missing from this extract; MRI, used in the body, is presumably declared
// there.
//
// Visible behavior: when MI has an saddr operand whose register is not of an
// SGPR class, replaces that register with an SGPR obtained through
// readlaneVGPRToSGPR, constrained to the register class declared for that
// operand.
7107 MachineInstr &MI) const {
// Skip instructions that are neither segment-specific FLAT nor covered by the
// subtarget's hasFlatGVSMode() feature.
7108 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
7109 return;
7110
7111 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
7112 // thinks they are uniform, so a readfirstlane should be valid.
7113 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
7114 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
7115 return;
7116
// NOTE(review): listing line 7117, the condition guarding this return, is
// missing from this extract.
7118 return;
7119
// Register class the instruction description declares for the saddr operand.
7120 const TargetRegisterClass *DeclaredRC =
7121 getRegClass(MI.getDesc(), SAddr->getOperandNo());
7122
7123 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
7124 SAddr->setReg(ToSGPR);
7125}
7126
// NOTE(review): several signature lines (listing lines 7127-7128 and
// 7130-7131) are missing from this extract; InsertMBB, I, Op and MRI, used in
// the body, are presumably declared there.
//
// Visible behavior: if the register class of Op (taking its subregister into
// account) is not exactly DstRC, inserts a COPY of Op's register into a new
// DstRC virtual register before I in InsertMBB and redirects Op to the new
// register. It then attempts an immediate fold into the copy and, for
// non-SGPR destinations, marks the copy as implicitly reading EXEC.
7129 const TargetRegisterClass *DstRC,
7132 const DebugLoc &DL) const {
7133 Register OpReg = Op.getReg();
7134 unsigned OpSubReg = Op.getSubReg();
7135
7136 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
7137 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
7138
7139 // Check if operand is already the correct register class.
7140 if (DstRC == OpRC)
7141 return;
7142
7143 Register DstReg = MRI.createVirtualRegister(DstRC);
7144 auto Copy =
7145 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
7146 Op.setReg(DstReg);
7147
// Without a defining instruction there is nothing further to inspect.
7148 MachineInstr *Def = MRI.getVRegDef(OpReg);
7149 if (!Def)
7150 return;
7151
7152 // Try to eliminate the copy if it is copying an immediate value.
7153 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
7154 foldImmediate(*Copy, *Def, OpReg, &MRI);
7155
// Follow the chain of COPY definitions back from Def to find out whether the
// value ultimately comes from an implicit def. The walk stops at a copy from a
// physical register or when no unique def exists.
7156 bool ImpDef = Def->isImplicitDef();
7157 while (!ImpDef && Def && Def->isCopy()) {
7158 if (Def->getOperand(1).getReg().isPhysical())
7159 break;
7160 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
7161 ImpDef = Def && Def->isImplicitDef();
7162 }
// Add an implicit EXEC use unless the destination is an SGPR class, the copy
// already reads EXEC, or the copied value traces back to an implicit def.
7163 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
7164 !ImpDef)
7165 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
7166}
7167
7168// Emit the actual waterfall loop, executing the wrapped instruction for each
7169// unique value of \p ScalarOps across all lanes. In the best case we execute 1
7170// iteration, in the worst case we execute 64 (once per lane).
//
// NOTE(review): the leading signature lines (listing lines 7171-7172) and
// three body lines (7176, 7178, 7181) are missing from this extract. TII, MRI,
// LoopBB, ST, LMC and the insertion iterator I, all used below, are presumably
// declared on those lines.
//
// Visible behavior: for every operand in ScalarOps, emits into LoopBB a
// V_READFIRSTLANE_B32 of the VGPR value plus a V_CMP_EQ against the original
// VGPR, ANDs all comparison results into CondReg, and rewrites the operand to
// use the SGPR result (or the matching entry of PhySGPRs when one is valid).
// It then emits the and-saveexec on CondReg and, at the end of BodyBB, the
// EXEC update and the SI_WATERFALL_LOOP terminator targeting LoopBB.
7173 MachineBasicBlock &BodyBB, const DebugLoc &DL,
7174 ArrayRef<MachineOperand *> ScalarOps, ArrayRef<Register> PhySGPRs = {}) {
7175 MachineFunction &MF = *LoopBB.getParent();
7177 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7179 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7180
// CondReg accumulates the AND of all per-operand (and per-64-bit-piece)
// equality masks; it is invalid until the first comparison is emitted.
7182 Register CondReg;
7183 for (auto [Idx, ScalarOp] : enumerate(ScalarOps)) {
7184 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
7185 unsigned NumSubRegs = RegSize / 32;
7186 Register VScalarOp = ScalarOp->getReg();
7187
// Register class required for the source operand (operand 1) of
// V_READFIRSTLANE_B32.
7188 const TargetRegisterClass *RFLSrcRC =
7189 TII.getRegClass(TII.get(AMDGPU::V_READFIRSTLANE_B32), 1);
7190
7191 if (NumSubRegs == 1) {
// If the operand's class is not already within RFLSrcRC, copy it into the
// common subclass first so it is a valid readfirstlane source.
7192 const TargetRegisterClass *VScalarOpRC = MRI.getRegClass(VScalarOp);
7193 if (const TargetRegisterClass *Common =
7194 TRI->getCommonSubClass(VScalarOpRC, RFLSrcRC);
7195 Common != VScalarOpRC) {
7196 Register VRReg = MRI.createVirtualRegister(Common);
7197 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::COPY), VRReg).addReg(VScalarOp);
7198 VScalarOp = VRReg;
7199 }
7200 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7201
7202 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
7203 .addReg(VScalarOp);
7204
7205 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7206
// Mask of lanes whose value equals the one just read.
7207 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
7208 .addReg(CurReg)
7209 .addReg(VScalarOp);
7210
7211 // Combine the comparison results with AND.
7212 if (!CondReg) // First.
7213 CondReg = NewCondReg;
7214 else { // If not the first, we create an AND.
7215 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7216 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7217 .addReg(CondReg)
7218 .addReg(NewCondReg);
7219 CondReg = AndReg;
7220 }
7221
7222 // Update ScalarOp operand to use the SGPR ScalarOp.
7223 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7224 ScalarOp->setReg(CurReg);
7225 else {
7226 // Insert into the same block of use
7227 BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
7228 TII.get(AMDGPU::COPY), PhySGPRs[Idx])
7229 .addReg(CurReg);
7230 ScalarOp->setReg(PhySGPRs[Idx]);
7231 }
7232 ScalarOp->setIsKill();
7233 } else {
// Wide operand: processed two 32-bit channels at a time so each pair can be
// compared with a single 64-bit compare.
7234 SmallVector<Register, 8> ReadlanePieces;
7235 RegState VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
7236 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7237 "Unhandled register size");
7238
// NOTE(review): this loop variable shadows the enumerate() index Idx of the
// outer loop. The PhySGPRs[Idx] accesses after the loop use the outer index
// again, because the inner one is scoped to this for statement.
7239 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7240 Register CurRegLo =
7241 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7242 Register CurRegHi =
7243 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7244
7245 // Read the next variant <- also loop target.
7246 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
7247 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
7248
7249 // Read the next variant <- also loop target.
7250 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
7251 .addReg(VScalarOp, VScalarOpUndef,
7252 TRI->getSubRegFromChannel(Idx + 1));
7253
7254 ReadlanePieces.push_back(CurRegLo);
7255 ReadlanePieces.push_back(CurRegHi);
7256
7257 // Comparison is to be done as 64-bit.
7258 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
7259 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
7260 .addReg(CurRegLo)
7261 .addImm(AMDGPU::sub0)
7262 .addReg(CurRegHi)
7263 .addImm(AMDGPU::sub1);
7264
7265 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7266 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
7267 NewCondReg)
7268 .addReg(CurReg);
// A 64-bit operand is compared whole; wider operands are compared against the
// matching two-channel subregister.
7269 if (NumSubRegs <= 2)
7270 Cmp.addReg(VScalarOp);
7271 else
7272 Cmp.addReg(VScalarOp, VScalarOpUndef,
7273 TRI->getSubRegFromChannel(Idx, 2));
7274
7275 // Combine the comparison results with AND.
7276 if (!CondReg) // First.
7277 CondReg = NewCondReg;
7278 else { // If not the first, we create an AND.
7279 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7280 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7281 .addReg(CondReg)
7282 .addReg(NewCondReg);
7283 CondReg = AndReg;
7284 }
7285 } // End for loop.
7286
7287 const auto *SScalarOpRC =
7288 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
7289 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
7290
7291 // Build scalar ScalarOp.
// Pieces were pushed in channel order, so the Nth piece goes to channel N.
7292 auto Merge =
7293 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7294 unsigned Channel = 0;
7295 for (Register Piece : ReadlanePieces) {
7296 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7297 }
7298
7299 // Update ScalarOp operand to use the SGPR ScalarOp.
7300 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7301 ScalarOp->setReg(SScalarOp);
7302 else {
7303 BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
7304 TII.get(AMDGPU::COPY), PhySGPRs[Idx])
7305 .addReg(SScalarOp);
7306 ScalarOp->setReg(PhySGPRs[Idx]);
7307 }
7308 ScalarOp->setIsKill();
7309 }
7310 }
7311
// Give SaveExec an allocation hint of CondReg.
7312 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7313 MRI.setSimpleHint(SaveExec, CondReg);
7314
7315 // Update EXEC to matching lanes, saving original to SaveExec.
7316 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7317 .addReg(CondReg, RegState::Kill);
7318
7319 // The original instruction is here; we insert the terminators after it.
7320 I = BodyBB.end();
7321
7322 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7323 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7324 .addReg(LMC.ExecReg)
7325 .addReg(SaveExec);
7326
7327 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7328}
7329
7330// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7331// with SGPRs by iterating over all unique values across all lanes.
7332// Returns the loop basic block that now contains \p MI.
//
// NOTE(review): the signature lines naming the function and its leading
// parameters (listing lines 7334-7336), and body lines 7344, 7354, 7365, 7380,
// 7389-7390 and 7392, are missing from this extract. TII, MI, ScalarOps, MDT,
// ST, LMC, AfterMI, LoopBB, BodyBB and MBBI, all used below, are presumably
// declared on those lines.
//
// Visible behavior: splits MI's block into MBB -> LoopBB -> BodyBB ->
// RemainderBB. The instructions in [Begin, End) move into BodyBB and
// everything from End onwards into RemainderBB. EXEC (and SCC, unless it is
// known dead) is saved before Begin and restored at the top of RemainderBB.
// The loop contents are emitted by emitLoadScalarOpsFromVGPRLoop. Begin and End
// default to MI and the instruction after MI. The function returns BodyBB.
7333static MachineBasicBlock *
7337 MachineBasicBlock::iterator Begin = nullptr,
7338 MachineBasicBlock::iterator End = nullptr,
7339 ArrayRef<Register> PhySGPRs = {}) {
7340 assert((PhySGPRs.empty() || PhySGPRs.size() == ScalarOps.size()) &&
7341 "Physical SGPRs must be empty or match the number of scalar operands");
7342 MachineBasicBlock &MBB = *MI.getParent();
7343 MachineFunction &MF = *MBB.getParent();
7345 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7346 MachineRegisterInfo &MRI = MF.getRegInfo();
// Default range when the caller passed none: just MI itself.
7347 if (!Begin.isValid())
7348 Begin = &MI;
7349 if (!End.isValid()) {
7350 End = &MI;
7351 ++End;
7352 }
7353 const DebugLoc &DL = MI.getDebugLoc();
7355 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7356
7357 // Save SCC. Waterfall Loop may overwrite SCC.
7358 Register SaveSCCReg;
7359
7360 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7361 // rather than unlimited scan everywhere
// NOTE(review): the right-hand side of this comparison (listing line 7365) is
// missing from this extract; by the variable's name it is presumably the
// "dead" liveness result.
7362 bool SCCNotDead =
7363 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7364 std::numeric_limits<unsigned>::max()) !=
7366 if (SCCNotDead) {
// S_CSELECT_B32 with immediates 1 and 0 records the current SCC in
// SaveSCCReg; the S_CMP_LG_U32 against 0 in RemainderBB is labelled below as
// the restore.
7367 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7368 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7369 .addImm(1)
7370 .addImm(0);
7371 }
7372
7373 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7374
7375 // Save the EXEC mask
7376 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7377
7378 // Killed uses in the instruction we are waterfalling around will be
7379 // incorrect due to the added control-flow.
// NOTE(review): the declaration of AfterMI (listing line 7380) is missing from
// this extract.
7381 ++AfterMI;
7382 for (auto I = Begin; I != AfterMI; I++) {
7383 for (auto &MO : I->all_uses())
7384 MRI.clearKillFlags(MO.getReg());
7385 }
7386
7387 // To insert the loop we need to split the block. Move everything after this
7388 // point to a new block, and insert a new empty block between the two.
// NOTE(review): the creation of LoopBB and BodyBB (listing lines 7389-7390)
// and the declaration of MBBI (7392) are missing from this extract.
7391 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7393 ++MBBI;
7394
// All three insertions use the same MBBI, so the resulting layout order is
// LoopBB, BodyBB, RemainderBB.
7395 MF.insert(MBBI, LoopBB);
7396 MF.insert(MBBI, BodyBB);
7397 MF.insert(MBBI, RemainderBB);
7398
// CFG edges: LoopBB falls into BodyBB, which either branches back to LoopBB or
// exits to RemainderBB.
7399 LoopBB->addSuccessor(BodyBB);
7400 BodyBB->addSuccessor(LoopBB);
7401 BodyBB->addSuccessor(RemainderBB);
7402
7403 // Move Begin to MI to the BodyBB, and the remainder of the block to
7404 // RemainderBB.
// Order matters: the tail [End, end) is spliced away first, so the second
// splice's "MBB.end()" bounds only the [Begin, End) range.
7405 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7406 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7407 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7408
7409 MBB.addSuccessor(LoopBB);
7410
7411 // Update dominators. We know that MBB immediately dominates LoopBB, that
7412 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7413 // RemainderBB. RemainderBB immediately dominates all of the successors
7414 // transferred to it from MBB that MBB used to properly dominate.
// The dominator tree is optional; it is only updated when one was supplied.
7415 if (MDT) {
7416 MDT->addNewBlock(LoopBB, &MBB);
7417 MDT->addNewBlock(BodyBB, LoopBB);
7418 MDT->addNewBlock(RemainderBB, BodyBB);
7419 for (auto &Succ : RemainderBB->successors()) {
7420 if (MDT->properlyDominates(&MBB, Succ)) {
7421 MDT->changeImmediateDominator(Succ, RemainderBB);
7422 }
7423 }
7424 }
7425
7426 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps,
7427 PhySGPRs);
7428
// Both restores are inserted before First, i.e. ahead of the instructions
// spliced into RemainderBB, in the order SCC restore then EXEC restore.
7429 MachineBasicBlock::iterator First = RemainderBB->begin();
7430 // Restore SCC
7431 if (SCCNotDead) {
7432 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7433 .addReg(SaveSCCReg, RegState::Kill)
7434 .addImm(0);
7435 }
7436
7437 // Restore the EXEC mask
7438 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7439 .addReg(SaveExec);
7440 return BodyBB;
7441}
7442
7443// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
//
// NOTE(review): the line naming the function and its parameters (listing line
// 7445) is missing from this extract; TII, MI and Rsrc, used in the body, are
// presumably declared there.
//
// Visible behavior: inserts instructions before MI and returns the tuple
// (RsrcPtr, NewSRsrc). RsrcPtr is the sub0_sub1 subregister extracted from
// Rsrc as a VReg_64. NewSRsrc is a new SGPR_128 whose sub0_sub1 is zero and
// whose sub2 and sub3 hold the low and high halves of the default resource
// data format.
7444static std::tuple<unsigned, unsigned>
7446 MachineBasicBlock &MBB = *MI.getParent();
7447 MachineFunction &MF = *MBB.getParent();
7448 MachineRegisterInfo &MRI = MF.getRegInfo();
7449
7450 // Extract the ptr from the resource descriptor.
7451 unsigned RsrcPtr =
7452 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7453 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7454
7455 // Create an empty resource descriptor
7456 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7457 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7458 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7459 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7460 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7461
7462 // Zero64 = 0
7463 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7464 .addImm(0);
7465
7466 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7467 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7468 .addImm(Lo_32(RsrcDataFormat));
7469
7470 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7471 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7472 .addImm(Hi_32(RsrcDataFormat));
7473
7474 // NewSRsrc = {Zero64, SRsrcFormat}
7475 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7476 .addReg(Zero64)
7477 .addImm(AMDGPU::sub0_sub1)
7478 .addReg(SRsrcFormatLo)
7479 .addImm(AMDGPU::sub2)
7480 .addReg(SRsrcFormatHi)
7481 .addImm(AMDGPU::sub3);
7482
7483 return std::tuple(RsrcPtr, NewSRsrc);
7484}
7485
7488 MachineDominatorTree *MDT) const {
7489 MachineFunction &MF = *MI.getMF();
7490 MachineRegisterInfo &MRI = MF.getRegInfo();
7491 MachineBasicBlock *CreatedBB = nullptr;
7492
7493 // Legalize VOP2
7494 if (isVOP2(MI) || isVOPC(MI)) {
7496 return CreatedBB;
7497 }
7498
7499 // Legalize VOP3
7500 if (isVOP3(MI)) {
7502 return CreatedBB;
7503 }
7504
7505 // Legalize SMRD
7506 if (isSMRD(MI)) {
7508 return CreatedBB;
7509 }
7510
7511 // Legalize FLAT
7512 if (isFLAT(MI)) {
7514 return CreatedBB;
7515 }
7516
7517 // Legalize PHI
7518 // The register class of the operands must be the same type as the register
7519 // class of the output.
7520 if (MI.getOpcode() == AMDGPU::PHI) {
7521 const TargetRegisterClass *VRC = getOpRegClass(MI, 0);
7522 assert(!RI.isSGPRClass(VRC));
7523
7524 // Update all the operands so they have the same type.
7525 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7526 MachineOperand &Op = MI.getOperand(I);
7527 if (!Op.isReg() || !Op.getReg().isVirtual())
7528 continue;
7529
7530 // MI is a PHI instruction.
7531 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7533
7534 // Avoid creating no-op copies with the same src and dst reg class. These
7535 // confuse some of the machine passes.
7536 legalizeGenericOperand(*InsertBB, Insert, VRC, Op, MRI, MI.getDebugLoc());
7537 }
7538 }
7539
7540 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7541 // VGPR dest type and SGPR sources, insert copies so all operands are
7542 // VGPRs. This seems to help operand folding / the register coalescer.
7543 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7544 MachineBasicBlock *MBB = MI.getParent();
7545 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7546 if (RI.hasVGPRs(DstRC)) {
7547 // Update all the operands so they are VGPR register classes. These may
7548 // not be the same register class because REG_SEQUENCE supports mixing
7549 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7550 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7551 MachineOperand &Op = MI.getOperand(I);
7552 if (!Op.isReg() || !Op.getReg().isVirtual())
7553 continue;
7554
7555 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7556 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7557 if (VRC == OpRC)
7558 continue;
7559
7560 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7561 Op.setIsKill();
7562 }
7563 }
7564
7565 return CreatedBB;
7566 }
7567
7568 // Legalize INSERT_SUBREG
7569 // src0 must have the same register class as dst
7570 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7571 Register Dst = MI.getOperand(0).getReg();
7572 Register Src0 = MI.getOperand(1).getReg();
7573 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7574 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7575 if (DstRC != Src0RC) {
7576 MachineBasicBlock *MBB = MI.getParent();
7577 MachineOperand &Op = MI.getOperand(1);
7578 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7579 }
7580 return CreatedBB;
7581 }
7582
7583 // Legalize SI_INIT_M0
7584 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7585 MachineOperand &Src = MI.getOperand(0);
7586 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7587 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7588 return CreatedBB;
7589 }
7590
7591 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7592 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7593 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7594 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7595 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7596 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7597 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7598 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7599 MachineOperand &Src = MI.getOperand(1);
7600 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7601 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7602 return CreatedBB;
7603 }
7604
7605 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7606 //
7607 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7608 // scratch memory access. In both cases, the legalization never involves
7609 // conversion to the addr64 form.
7611 (isMUBUF(MI) || isMTBUF(MI)))) {
7612 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7613 ? AMDGPU::OpName::rsrc
7614 : AMDGPU::OpName::srsrc;
7615 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7616 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7617 CreatedBB = generateWaterFallLoop(*this, MI, {SRsrc}, MDT);
7618
7619 AMDGPU::OpName SampOpName =
7620 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7621 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7622 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7623 CreatedBB = generateWaterFallLoop(*this, MI, {SSamp}, MDT);
7624
7625 return CreatedBB;
7626 }
7627
7628 // Legalize SI_CALL
7629 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7630 MachineOperand *Dest = &MI.getOperand(0);
7631 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7632 createWaterFallForSiCall(&MI, MDT, {Dest});
7633 }
7634 }
7635
7636 // Legalize s_sleep_var.
7637 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7638 const DebugLoc &DL = MI.getDebugLoc();
7639 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7640 int Src0Idx =
7641 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7642 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7643 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7644 .add(Src0);
7645 Src0.ChangeToRegister(Reg, false);
7646 return nullptr;
7647 }
7648
7649 // Legalize TENSOR_LOAD_TO_LDS_d2/_d4, TENSOR_STORE_FROM_LDS_d2/_d4. All their
7650 // operands are scalar.
7651 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d2 ||
7652 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d4 ||
7653 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d2 ||
7654 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d4) {
7655 for (MachineOperand &Src : MI.explicit_operands()) {
7656 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7657 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7658 }
7659 return CreatedBB;
7660 }
7661
7662 // Legalize MUBUF instructions.
7663 bool isSoffsetLegal = true;
7664 int SoffsetIdx =
7665 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7666 if (SoffsetIdx != -1) {
7667 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7668 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7669 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7670 isSoffsetLegal = false;
7671 }
7672 }
7673
7674 bool isRsrcLegal = true;
7675 int RsrcIdx =
7676 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7677 if (RsrcIdx != -1) {
7678 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7679 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7680 isRsrcLegal = false;
7681 }
7682
7683 // The operands are legal.
7684 if (isRsrcLegal && isSoffsetLegal)
7685 return CreatedBB;
7686
7687 if (!isRsrcLegal) {
7688 // Legalize a VGPR Rsrc
7689 //
7690 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7691 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7692 // a zero-value SRsrc.
7693 //
7694 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7695 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7696 // above.
7697 //
7698 // Otherwise we are on non-ADDR64 hardware, and/or we have
7699 // idxen/offen/bothen and we fall back to a waterfall loop.
7700
7701 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7702 MachineBasicBlock &MBB = *MI.getParent();
7703
7704 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7705 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7706 // This is already an ADDR64 instruction so we need to add the pointer
7707 // extracted from the resource descriptor to the current value of VAddr.
7708 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7709 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7710 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7711
7712 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7713 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7714 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7715
7716 unsigned RsrcPtr, NewSRsrc;
7717 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7718
7719 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7720 const DebugLoc &DL = MI.getDebugLoc();
7721 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7722 .addDef(CondReg0)
7723 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7724 .addReg(VAddr->getReg(), {}, AMDGPU::sub0)
7725 .addImm(0);
7726
7727 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7728 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7729 .addDef(CondReg1, RegState::Dead)
7730 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7731 .addReg(VAddr->getReg(), {}, AMDGPU::sub1)
7732 .addReg(CondReg0, RegState::Kill)
7733 .addImm(0);
7734
7735 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7736 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7737 .addReg(NewVAddrLo)
7738 .addImm(AMDGPU::sub0)
7739 .addReg(NewVAddrHi)
7740 .addImm(AMDGPU::sub1);
7741
7742 VAddr->setReg(NewVAddr);
7743 Rsrc->setReg(NewSRsrc);
7744 } else if (!VAddr && ST.hasAddr64()) {
7745 // This instruction is the _OFFSET variant, so we need to convert it to
7746 // ADDR64.
7747 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7748 "FIXME: Need to emit flat atomics here");
7749
7750 unsigned RsrcPtr, NewSRsrc;
7751 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7752
7753 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7754 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7755 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7756 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7757 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7758
7759 // Atomics with return have an additional tied operand and are
7760 // missing some of the special bits.
7761 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7762 MachineInstr *Addr64;
7763
7764 if (!VDataIn) {
7765 // Regular buffer load / store.
7767 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7768 .add(*VData)
7769 .addReg(NewVAddr)
7770 .addReg(NewSRsrc)
7771 .add(*SOffset)
7772 .add(*Offset);
7773
7774 if (const MachineOperand *CPol =
7775 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7776 MIB.addImm(CPol->getImm());
7777 }
7778
7779 if (const MachineOperand *TFE =
7780 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7781 MIB.addImm(TFE->getImm());
7782 }
7783
7784 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7785
7786 MIB.cloneMemRefs(MI);
7787 Addr64 = MIB;
7788 } else {
7789 // Atomics with return.
7790 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7791 .add(*VData)
7792 .add(*VDataIn)
7793 .addReg(NewVAddr)
7794 .addReg(NewSRsrc)
7795 .add(*SOffset)
7796 .add(*Offset)
7797 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7798 .cloneMemRefs(MI);
7799 }
7800
7801 MI.removeFromParent();
7802
7803 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7804 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7805 NewVAddr)
7806 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7807 .addImm(AMDGPU::sub0)
7808 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7809 .addImm(AMDGPU::sub1);
7810 } else {
7811 // Legalize a VGPR Rsrc and soffset together.
7812 if (!isSoffsetLegal) {
7813 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7814 CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc, Soffset}, MDT);
7815 return CreatedBB;
7816 }
7817 CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc}, MDT);
7818 return CreatedBB;
7819 }
7820 }
7821
7822 // Legalize a VGPR soffset.
7823 if (!isSoffsetLegal) {
7824 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7825 CreatedBB = generateWaterFallLoop(*this, MI, {Soffset}, MDT);
7826 return CreatedBB;
7827 }
7828 return CreatedBB;
7829}
7830
7832 InstrList.insert(MI); // every instruction is recorded in InstrList
7833 // Also add MBUF instructions (those with an srsrc operand) to the deferred list.
7834 int RsrcIdx =
7835 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7836 if (RsrcIdx != -1) { // -1 means the opcode has no srsrc operand
7837 DeferredList.insert(MI);
7838 }
7839}
7840
7842 return DeferredList.contains(MI); // true iff MI is in the deferred list
7843}
7844
7845// Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7846// lowering (changing an SGPR to a VGPR).
7847// These mismatches mainly arise because 16-bit SALU and 16-bit VALU
7848// instructions use registers of different sizes, so operand sizes must be
7849// legalized along the VGPR lowering chain. Removable once sgpr16 is in place.
7851 MachineRegisterInfo &MRI) const {
7852 if (!ST.useRealTrue16Insts()) // only relevant with real true16 instructions
7853 return;
7854
7855 unsigned Opcode = MI.getOpcode();
7856 MachineBasicBlock *MBB = MI.getParent();
7857 // Legalize operands and check for size mismatch; skip operand 0, out-of-range indices and operands whose descriptor RegClass is -1.
7858 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7859 OpIdx >= get(Opcode).getNumOperands() ||
7860 get(Opcode).operands()[OpIdx].RegClass == -1)
7861 return;
7862
7863 MachineOperand &Op = MI.getOperand(OpIdx);
7864 if (!Op.isReg() || !Op.getReg().isVirtual()) // only virtual register operands are rewritten
7865 return;
7866
7867 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7868 if (!RI.isVGPRClass(CurrRC)) // only VGPR operands are adjusted here
7869 return;
7870
7871 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7872 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7873 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) { // CurrRC has lo16 subregisters in ExpectedRC
7874 Op.setSubReg(AMDGPU::lo16); // keep the register, read only its lo16 half
7875 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) { // ExpectedRC has lo16 subregisters in CurrRC
7876 const DebugLoc &DL = MI.getDebugLoc();
7877 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7878 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7879 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef); // undefined value used for the hi16 half
7880 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg) // NewDstReg = {hi16: Undef, lo16: original operand}
7881 .addReg(Op.getReg())
7882 .addImm(AMDGPU::lo16)
7883 .addReg(Undef)
7884 .addImm(AMDGPU::hi16);
7885 Op.setReg(NewDstReg); // the operand now refers to the widened VGPR_32 register
7886 }
7887}
7889 MachineRegisterInfo &MRI) const {
7890 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++) // walks explicit operands 1..N-1; operand 0 is never visited
7892}
7893
7897 ArrayRef<Register> PhySGPRs) const {
7898 assert(MI->getOpcode() == AMDGPU::SI_CALL_ISEL &&
7899 "This only handle waterfall for SI_CALL_ISEL");
7900 // Move the whole call sequence into the loop block: everything between
7901 // ADJCALLSTACKUP and ADJCALLSTACKDOWN, plus the copies that follow it.
7902 // Copies from and to physical registers therefore also have to be moved
7903 // into the loop block.
7904 MachineBasicBlock &MBB = *MI->getParent();
7906 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP) // walk backwards to the ADJCALLSTACKUP
7907 --Start;
7909 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN) // walk forwards to the ADJCALLSTACKDOWN
7910 ++End;
7911
7912 // Also include following copies of the return value
7913 ++End; // step past ADJCALLSTACKDOWN itself
7914 while (End != MBB.end() && End->isCopy() &&
7915 MI->definesRegister(End->getOperand(1).getReg(), &RI)) // copy whose source register is defined by MI
7916 ++End;
7917
7918 generateWaterFallLoop(*this, *MI, ScalarOps, MDT, Start, End, PhySGPRs); // Start/End delimit the range handed to the loop builder
7919}
7920
7922 MachineDominatorTree *MDT) const {
7924 DenseMap<MachineInstr *, bool> V2SPhyCopiesToErase; // copy instruction -> whether it may be erased at the end
7925 while (!Worklist.empty()) {
7926 MachineInstr &Inst = *Worklist.top();
7927 Worklist.erase_top();
7928 // Skip instructions in the deferred list; they are processed after this loop.
7929 if (Worklist.isDeferred(&Inst))
7930 continue;
7931 moveToVALUImpl(Worklist, MDT, Inst, WaterFalls, V2SPhyCopiesToErase);
7932 }
7933
7934 // The deferred list of instructions is processed once
7935 // all the MachineInstrs in the worklist are done.
7936 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7937 moveToVALUImpl(Worklist, MDT, *Inst, WaterFalls, V2SPhyCopiesToErase);
7938 assert(Worklist.empty() &&
7939 "Deferred MachineInstr are not supposed to re-populate worklist");
7940 }
7941
7942 for (std::pair<MachineInstr *, V2PhysSCopyInfo> &Entry : WaterFalls) { // build the waterfall loops recorded while lowering
7943 if (Entry.first->getOpcode() == AMDGPU::SI_CALL_ISEL) // only SI_CALL_ISEL entries are handled here
7944 createWaterFallForSiCall(Entry.first, MDT, Entry.second.MOs,
7945 Entry.second.SGPRs);
7946 }
7947
7948 for (std::pair<MachineInstr *, bool> Entry : V2SPhyCopiesToErase)
7949 if (Entry.second) // erase only the copies still marked erasable
7950 Entry.first->eraseFromParent();
7951}
7953 MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const {
7954 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7955 // hope for the best. All new instructions are inserted before Inst.
7956 const TargetRegisterClass *DstRC = RI.getRegClassForReg(MRI, DstReg);
7957 ArrayRef<int16_t> SubRegIndices = RI.getRegSplitParts(DstRC, 4); // NOTE(review): presumably splits DstRC into 4-byte pieces - confirm
7958 if (SubRegIndices.size() <= 1) { // at most one piece: a single readfirstlane, then a copy into DstReg
7959 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7960 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7961 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7962 .add(Inst.getOperand(1));
7963 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
7964 DstReg)
7965 .addReg(NewDst);
7966 } else { // several pieces: readfirstlane each one, then assemble DstReg with REG_SEQUENCE
7968 for (int16_t Indice : SubRegIndices) {
7969 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7970 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7971 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7972 .addReg(Inst.getOperand(1).getReg(), {}, Indice); // read subregister Indice of the copy source
7973
7974 DstRegs.push_back(NewDst);
7975 }
7977 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7978 get(AMDGPU::REG_SEQUENCE), DstReg);
7979 for (unsigned i = 0; i < SubRegIndices.size(); ++i) { // append (register, subregister index) pairs in piece order
7980 MIB.addReg(DstRegs[i]);
7981 MIB.addImm(RI.getSubRegFromChannel(i));
7982 }
7983 }
7984}
7985
7987 SIInstrWorklist &Worklist, Register DstReg, MachineInstr &Inst,
7990 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
7991 if (DstReg == AMDGPU::M0) { // copy into M0: always lower through readfirstlane and mark the copy erasable
7992 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
7993 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7994 return;
7995 }
7996 Register SrcReg = Inst.getOperand(1).getReg();
7999 // Only search the current block, since a physreg's def and use cannot cross
8000 // blocks when MF.NoPhi = false.
8001 while (++I != E) {
8002 // For SI_CALL_ISEL users, replace the phys SGPR with the VGPR source
8003 // and record the operand for later waterfall loop generation.
8004 if (I->getOpcode() == AMDGPU::SI_CALL_ISEL) {
8005 MachineInstr *UseMI = &*I;
8006 for (unsigned i = 0; i < UseMI->getNumOperands(); ++i) {
8007 if (UseMI->getOperand(i).isReg() &&
8008 UseMI->getOperand(i).getReg() == DstReg) {
8009 MachineOperand *MO = &UseMI->getOperand(i);
8010 MO->setReg(SrcReg);
8011 V2PhysSCopyInfo &V2SCopyInfo = WaterFalls[UseMI]; // creates the entry for UseMI on first use
8012 V2SCopyInfo.MOs.push_back(MO);
8013 V2SCopyInfo.SGPRs.push_back(DstReg);
8014 V2SPhyCopiesToErase.try_emplace(&Inst, true); // try_emplace does not overwrite an earlier 'false' entry
8015 }
8016 }
8017 } else if (I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG &&
8018 I->getOperand(0).isReg() &&
8019 I->getOperand(0).getReg() == DstReg) {
8020 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst); // operand 0 of the return is DstReg: materialize it with readfirstlane
8021 V2SPhyCopiesToErase.try_emplace(&Inst, true);
8022 } else if (I->readsRegister(DstReg, &RI)) {
8023 // The COPY cannot be erased if any other kind of instruction uses it.
8024 V2SPhyCopiesToErase[&Inst] = false; // operator[] overrides any earlier 'true'
8025 }
8026 if (I->findRegisterDefOperand(DstReg, &RI)) // stop scanning once DstReg is redefined
8027 break;
8028 }
8029}
8030
8032 SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst,
8034 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
8035
8037 if (!MBB)
8038 return;
8039 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
8040 unsigned Opcode = Inst.getOpcode();
8041 unsigned NewOpcode = getVALUOp(Inst);
8042 const DebugLoc &DL = Inst.getDebugLoc();
8043
8044 // Handle some special cases
8045 switch (Opcode) {
8046 default:
8047 break;
8048 case AMDGPU::S_ADD_I32:
8049 case AMDGPU::S_SUB_I32: {
8050 // FIXME: The u32 versions currently selected use the carry.
8051 bool Changed;
8052 MachineBasicBlock *CreatedBBTmp = nullptr;
8053 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
8054 if (Changed)
8055 return;
8056
8057 // Default handling
8058 break;
8059 }
8060
8061 case AMDGPU::S_MUL_U64:
8062 if (ST.hasVMulU64Inst()) {
8063 NewOpcode = AMDGPU::V_MUL_U64_e64;
8064 break;
8065 }
8066 // Split s_mul_u64 in 32-bit vector multiplications.
8067 splitScalarSMulU64(Worklist, Inst, MDT);
8068 Inst.eraseFromParent();
8069 return;
8070
8071 case AMDGPU::S_MUL_U64_U32_PSEUDO:
8072 case AMDGPU::S_MUL_I64_I32_PSEUDO:
8073 // This is a special case of s_mul_u64 where all the operands are either
8074 // zero extended or sign extended.
8075 splitScalarSMulPseudo(Worklist, Inst, MDT);
8076 Inst.eraseFromParent();
8077 return;
8078
8079 case AMDGPU::S_AND_B64:
8080 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
8081 Inst.eraseFromParent();
8082 return;
8083
8084 case AMDGPU::S_OR_B64:
8085 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
8086 Inst.eraseFromParent();
8087 return;
8088
8089 case AMDGPU::S_XOR_B64:
8090 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
8091 Inst.eraseFromParent();
8092 return;
8093
8094 case AMDGPU::S_NAND_B64:
8095 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
8096 Inst.eraseFromParent();
8097 return;
8098
8099 case AMDGPU::S_NOR_B64:
8100 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
8101 Inst.eraseFromParent();
8102 return;
8103
8104 case AMDGPU::S_XNOR_B64:
8105 if (ST.hasDLInsts())
8106 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
8107 else
8108 splitScalar64BitXnor(Worklist, Inst, MDT);
8109 Inst.eraseFromParent();
8110 return;
8111
8112 case AMDGPU::S_ANDN2_B64:
8113 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
8114 Inst.eraseFromParent();
8115 return;
8116
8117 case AMDGPU::S_ORN2_B64:
8118 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
8119 Inst.eraseFromParent();
8120 return;
8121
8122 case AMDGPU::S_BREV_B64:
8123 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
8124 Inst.eraseFromParent();
8125 return;
8126
8127 case AMDGPU::S_NOT_B64:
8128 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
8129 Inst.eraseFromParent();
8130 return;
8131
8132 case AMDGPU::S_BCNT1_I32_B64:
8133 splitScalar64BitBCNT(Worklist, Inst);
8134 Inst.eraseFromParent();
8135 return;
8136
8137 case AMDGPU::S_BFE_I64:
8138 splitScalar64BitBFE(Worklist, Inst);
8139 Inst.eraseFromParent();
8140 return;
8141
8142 case AMDGPU::S_FLBIT_I32_B64:
8143 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
8144 Inst.eraseFromParent();
8145 return;
8146 case AMDGPU::S_FF1_I32_B64:
8147 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
8148 Inst.eraseFromParent();
8149 return;
8150
8151 case AMDGPU::S_LSHL_B32:
8152 if (ST.hasOnlyRevVALUShifts()) {
8153 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
8154 swapOperands(Inst);
8155 }
8156 break;
8157 case AMDGPU::S_ASHR_I32:
8158 if (ST.hasOnlyRevVALUShifts()) {
8159 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
8160 swapOperands(Inst);
8161 }
8162 break;
8163 case AMDGPU::S_LSHR_B32:
8164 if (ST.hasOnlyRevVALUShifts()) {
8165 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
8166 swapOperands(Inst);
8167 }
8168 break;
8169 case AMDGPU::S_LSHL_B64:
8170 if (ST.hasOnlyRevVALUShifts()) {
8171 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
8172 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
8173 : AMDGPU::V_LSHLREV_B64_e64;
8174 swapOperands(Inst);
8175 }
8176 break;
8177 case AMDGPU::S_ASHR_I64:
8178 if (ST.hasOnlyRevVALUShifts()) {
8179 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
8180 swapOperands(Inst);
8181 }
8182 break;
8183 case AMDGPU::S_LSHR_B64:
8184 if (ST.hasOnlyRevVALUShifts()) {
8185 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
8186 swapOperands(Inst);
8187 }
8188 break;
8189
8190 case AMDGPU::S_ABS_I32:
8191 lowerScalarAbs(Worklist, Inst);
8192 Inst.eraseFromParent();
8193 return;
8194
8195 case AMDGPU::S_ABSDIFF_I32:
8196 lowerScalarAbsDiff(Worklist, Inst);
8197 Inst.eraseFromParent();
8198 return;
8199
8200 case AMDGPU::S_CBRANCH_SCC0:
8201 case AMDGPU::S_CBRANCH_SCC1: {
8202 // Clear unused bits of vcc
8203 Register CondReg = Inst.getOperand(1).getReg();
8204 bool IsSCC = CondReg == AMDGPU::SCC;
8206 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
8207 .addReg(LMC.ExecReg)
8208 .addReg(IsSCC ? LMC.VccReg : CondReg);
8209 Inst.removeOperand(1);
8210 } break;
8211
8212 case AMDGPU::S_BFE_U64:
8213 case AMDGPU::S_BFM_B64:
8214 llvm_unreachable("Moving this op to VALU not implemented");
8215
8216 case AMDGPU::S_PACK_LL_B32_B16:
8217 case AMDGPU::S_PACK_LH_B32_B16:
8218 case AMDGPU::S_PACK_HL_B32_B16:
8219 case AMDGPU::S_PACK_HH_B32_B16:
8220 movePackToVALU(Worklist, MRI, Inst);
8221 Inst.eraseFromParent();
8222 return;
8223
8224 case AMDGPU::S_XNOR_B32:
8225 lowerScalarXnor(Worklist, Inst);
8226 Inst.eraseFromParent();
8227 return;
8228
8229 case AMDGPU::S_NAND_B32:
8230 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
8231 Inst.eraseFromParent();
8232 return;
8233
8234 case AMDGPU::S_NOR_B32:
8235 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
8236 Inst.eraseFromParent();
8237 return;
8238
8239 case AMDGPU::S_ANDN2_B32:
8240 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
8241 Inst.eraseFromParent();
8242 return;
8243
8244 case AMDGPU::S_ORN2_B32:
8245 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
8246 Inst.eraseFromParent();
8247 return;
8248
8249 // TODO: remove as soon as everything is ready
8250 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
8251 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
8252 // can only be selected from the uniform SDNode.
8253 case AMDGPU::S_ADD_CO_PSEUDO:
8254 case AMDGPU::S_SUB_CO_PSEUDO: {
8255 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
8256 ? AMDGPU::V_ADDC_U32_e64
8257 : AMDGPU::V_SUBB_U32_e64;
8258 const auto *CarryRC = RI.getWaveMaskRegClass();
8259
8260 Register CarryInReg = Inst.getOperand(4).getReg();
8261 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
8262 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
8263 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
8264 .addReg(CarryInReg);
8265 }
8266
8267 Register CarryOutReg = Inst.getOperand(1).getReg();
8268
8269 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
8270 MRI.getRegClass(Inst.getOperand(0).getReg())));
8271 MachineInstr *CarryOp =
8272 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
8273 .addReg(CarryOutReg, RegState::Define)
8274 .add(Inst.getOperand(2))
8275 .add(Inst.getOperand(3))
8276 .addReg(CarryInReg)
8277 .addImm(0);
8278 legalizeOperands(*CarryOp);
8279 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
8280 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8281 Inst.eraseFromParent();
8282 }
8283 return;
8284 case AMDGPU::S_UADDO_PSEUDO:
8285 case AMDGPU::S_USUBO_PSEUDO: {
8286 MachineOperand &Dest0 = Inst.getOperand(0);
8287 MachineOperand &Dest1 = Inst.getOperand(1);
8288 MachineOperand &Src0 = Inst.getOperand(2);
8289 MachineOperand &Src1 = Inst.getOperand(3);
8290
8291 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8292 ? AMDGPU::V_ADD_CO_U32_e64
8293 : AMDGPU::V_SUB_CO_U32_e64;
8294 const TargetRegisterClass *NewRC =
8295 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
8296 Register DestReg = MRI.createVirtualRegister(NewRC);
8297 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
8298 .addReg(Dest1.getReg(), RegState::Define)
8299 .add(Src0)
8300 .add(Src1)
8301 .addImm(0); // clamp bit
8302
8303 legalizeOperands(*NewInstr, MDT);
8304 MRI.replaceRegWith(Dest0.getReg(), DestReg);
8305 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8306 Inst.eraseFromParent();
8307 }
8308 return;
8309 case AMDGPU::S_LSHL1_ADD_U32:
8310 case AMDGPU::S_LSHL2_ADD_U32:
8311 case AMDGPU::S_LSHL3_ADD_U32:
8312 case AMDGPU::S_LSHL4_ADD_U32: {
8313 MachineOperand &Dest = Inst.getOperand(0);
8314 MachineOperand &Src0 = Inst.getOperand(1);
8315 MachineOperand &Src1 = Inst.getOperand(2);
8316 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8317 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8318 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8319 : 4);
8320
8321 const TargetRegisterClass *NewRC =
8322 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
8323 Register DestReg = MRI.createVirtualRegister(NewRC);
8324 MachineInstr *NewInstr =
8325 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8326 .add(Src0)
8327 .addImm(ShiftAmt)
8328 .add(Src1);
8329
8330 legalizeOperands(*NewInstr, MDT);
8331 MRI.replaceRegWith(Dest.getReg(), DestReg);
8332 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8333 Inst.eraseFromParent();
8334 }
8335 return;
8336 case AMDGPU::S_CSELECT_B32:
8337 case AMDGPU::S_CSELECT_B64:
8338 lowerSelect(Worklist, Inst, MDT);
8339 Inst.eraseFromParent();
8340 return;
8341 case AMDGPU::S_CMP_EQ_I32:
8342 case AMDGPU::S_CMP_LG_I32:
8343 case AMDGPU::S_CMP_GT_I32:
8344 case AMDGPU::S_CMP_GE_I32:
8345 case AMDGPU::S_CMP_LT_I32:
8346 case AMDGPU::S_CMP_LE_I32:
8347 case AMDGPU::S_CMP_EQ_U32:
8348 case AMDGPU::S_CMP_LG_U32:
8349 case AMDGPU::S_CMP_GT_U32:
8350 case AMDGPU::S_CMP_GE_U32:
8351 case AMDGPU::S_CMP_LT_U32:
8352 case AMDGPU::S_CMP_LE_U32:
8353 case AMDGPU::S_CMP_EQ_U64:
8354 case AMDGPU::S_CMP_LG_U64:
8355 case AMDGPU::S_CMP_LT_F32:
8356 case AMDGPU::S_CMP_EQ_F32:
8357 case AMDGPU::S_CMP_LE_F32:
8358 case AMDGPU::S_CMP_GT_F32:
8359 case AMDGPU::S_CMP_LG_F32:
8360 case AMDGPU::S_CMP_GE_F32:
8361 case AMDGPU::S_CMP_O_F32:
8362 case AMDGPU::S_CMP_U_F32:
8363 case AMDGPU::S_CMP_NGE_F32:
8364 case AMDGPU::S_CMP_NLG_F32:
8365 case AMDGPU::S_CMP_NGT_F32:
8366 case AMDGPU::S_CMP_NLE_F32:
8367 case AMDGPU::S_CMP_NEQ_F32:
8368 case AMDGPU::S_CMP_NLT_F32: {
8369 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8370 auto NewInstr =
8371 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8372 .setMIFlags(Inst.getFlags());
8373 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
8374 0) {
8375 NewInstr
8376 .addImm(0) // src0_modifiers
8377 .add(Inst.getOperand(0)) // src0
8378 .addImm(0) // src1_modifiers
8379 .add(Inst.getOperand(1)) // src1
8380 .addImm(0); // clamp
8381 } else {
8382 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
8383 }
8384 legalizeOperands(*NewInstr, MDT);
8385 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8386 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8387 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8388 Inst.eraseFromParent();
8389 return;
8390 }
8391 case AMDGPU::S_CMP_LT_F16:
8392 case AMDGPU::S_CMP_EQ_F16:
8393 case AMDGPU::S_CMP_LE_F16:
8394 case AMDGPU::S_CMP_GT_F16:
8395 case AMDGPU::S_CMP_LG_F16:
8396 case AMDGPU::S_CMP_GE_F16:
8397 case AMDGPU::S_CMP_O_F16:
8398 case AMDGPU::S_CMP_U_F16:
8399 case AMDGPU::S_CMP_NGE_F16:
8400 case AMDGPU::S_CMP_NLG_F16:
8401 case AMDGPU::S_CMP_NGT_F16:
8402 case AMDGPU::S_CMP_NLE_F16:
8403 case AMDGPU::S_CMP_NEQ_F16:
8404 case AMDGPU::S_CMP_NLT_F16: {
8405 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8406 auto NewInstr =
8407 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8408 .setMIFlags(Inst.getFlags());
8409 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8410 NewInstr
8411 .addImm(0) // src0_modifiers
8412 .add(Inst.getOperand(0)) // src0
8413 .addImm(0) // src1_modifiers
8414 .add(Inst.getOperand(1)) // src1
8415 .addImm(0); // clamp
8416 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8417 NewInstr.addImm(0); // op_sel0
8418 } else {
8419 NewInstr
8420 .add(Inst.getOperand(0))
8421 .add(Inst.getOperand(1));
8422 }
8423 legalizeOperandsVALUt16(*NewInstr, MRI);
8424 legalizeOperands(*NewInstr, MDT);
8425 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8426 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8427 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8428 Inst.eraseFromParent();
8429 return;
8430 }
8431 case AMDGPU::S_CVT_HI_F32_F16: {
8432 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8433 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8434 if (ST.useRealTrue16Insts()) {
8435 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8436 .add(Inst.getOperand(1));
8437 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8438 .addImm(0) // src0_modifiers
8439 .addReg(TmpReg, {}, AMDGPU::hi16)
8440 .addImm(0) // clamp
8441 .addImm(0) // omod
8442 .addImm(0); // op_sel0
8443 } else {
8444 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8445 .addImm(16)
8446 .add(Inst.getOperand(1));
8447 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8448 .addImm(0) // src0_modifiers
8449 .addReg(TmpReg)
8450 .addImm(0) // clamp
8451 .addImm(0); // omod
8452 }
8453
8454 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8455 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8456 Inst.eraseFromParent();
8457 return;
8458 }
8459 case AMDGPU::S_MINIMUM_F32:
8460 case AMDGPU::S_MAXIMUM_F32: {
8461 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8462 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8463 .addImm(0) // src0_modifiers
8464 .add(Inst.getOperand(1))
8465 .addImm(0) // src1_modifiers
8466 .add(Inst.getOperand(2))
8467 .addImm(0) // clamp
8468 .addImm(0); // omod
8469 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8470
8471 legalizeOperands(*NewInstr, MDT);
8472 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8473 Inst.eraseFromParent();
8474 return;
8475 }
8476 case AMDGPU::S_MINIMUM_F16:
8477 case AMDGPU::S_MAXIMUM_F16: {
8478 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8479 ? &AMDGPU::VGPR_16RegClass
8480 : &AMDGPU::VGPR_32RegClass);
8481 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8482 .addImm(0) // src0_modifiers
8483 .add(Inst.getOperand(1))
8484 .addImm(0) // src1_modifiers
8485 .add(Inst.getOperand(2))
8486 .addImm(0) // clamp
8487 .addImm(0) // omod
8488 .addImm(0); // opsel0
8489 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8490 legalizeOperandsVALUt16(*NewInstr, MRI);
8491 legalizeOperands(*NewInstr, MDT);
8492 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8493 Inst.eraseFromParent();
8494 return;
8495 }
8496 case AMDGPU::V_S_EXP_F16_e64:
8497 case AMDGPU::V_S_LOG_F16_e64:
8498 case AMDGPU::V_S_RCP_F16_e64:
8499 case AMDGPU::V_S_RSQ_F16_e64:
8500 case AMDGPU::V_S_SQRT_F16_e64: {
8501 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8502 ? &AMDGPU::VGPR_16RegClass
8503 : &AMDGPU::VGPR_32RegClass);
8504 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8505 .add(Inst.getOperand(1)) // src0_modifiers
8506 .add(Inst.getOperand(2))
8507 .add(Inst.getOperand(3)) // clamp
8508 .add(Inst.getOperand(4)) // omod
8509 .setMIFlags(Inst.getFlags());
8510 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8511 NewInstr.addImm(0); // opsel0
8512 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8513 legalizeOperandsVALUt16(*NewInstr, MRI);
8514 legalizeOperands(*NewInstr, MDT);
8515 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8516 Inst.eraseFromParent();
8517 return;
8518 }
8519 }
8520
8521 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8522 // We cannot move this instruction to the VALU, so we should try to
8523 // legalize its operands instead.
8524 legalizeOperands(Inst, MDT);
8525 return;
8526 }
8527 // Handle converting generic instructions like COPY-to-SGPR into
8528 // COPY-to-VGPR.
8529 if (NewOpcode == Opcode) {
8530 Register DstReg = Inst.getOperand(0).getReg();
8531 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8532
8533 if (Inst.isCopy() && DstReg.isPhysical() &&
8534 Inst.getOperand(1).getReg().isVirtual()) {
8535 handleCopyToPhysHelper(Worklist, DstReg, Inst, MRI, WaterFalls,
8536 V2SPhyCopiesToErase);
8537 return;
8538 }
8539
8540 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
8541 Register NewDstReg = Inst.getOperand(1).getReg();
8542 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8543 if (const TargetRegisterClass *CommonRC =
8544 RI.getCommonSubClass(NewDstRC, SrcRC)) {
8545 // Instead of creating a copy where src and dst are the same register
8546 // class, we just replace all uses of dst with src. These kinds of
8547 // copies interfere with the heuristics MachineSink uses to decide
8548 // whether or not to split a critical edge. Since the pass assumes
8549 // that copies will end up as machine instructions and not be
8550 // eliminated.
8551 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8552 MRI.replaceRegWith(DstReg, NewDstReg);
8553 MRI.clearKillFlags(NewDstReg);
8554 Inst.getOperand(0).setReg(DstReg);
8555
8556 if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8557 llvm_unreachable("failed to constrain register");
8558
8559 Inst.eraseFromParent();
8560
8561 for (MachineOperand &UseMO :
8562 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8563 MachineInstr &UseMI = *UseMO.getParent();
8564
8565 // Legalize t16 operands since replaceReg is called after
8566 // addUsersToVALU.
8568
8569 unsigned OpIdx = UseMI.getOperandNo(&UseMO);
8570 if (const TargetRegisterClass *OpRC =
8571 getRegClass(UseMI.getDesc(), OpIdx))
8572 MRI.constrainRegClass(NewDstReg, OpRC);
8573 }
8574
8575 return;
8576 }
8577 }
8578
8579 // If this is a v2s copy between 16bit and 32bit reg,
8580 // replace vgpr copy to reg_sequence/extract_subreg
8581 // This can be removed after we have sgpr16 in place
8582 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8583 Inst.getOperand(1).getReg().isVirtual() &&
8584 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8585 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8586 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8587 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8588 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8589 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8590 get(AMDGPU::IMPLICIT_DEF), Undef);
8591 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8592 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8593 .addReg(Inst.getOperand(1).getReg())
8594 .addImm(AMDGPU::lo16)
8595 .addReg(Undef)
8596 .addImm(AMDGPU::hi16);
8597 Inst.eraseFromParent();
8598 MRI.replaceRegWith(DstReg, NewDstReg);
8599 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8600 return;
8601 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8602 AMDGPU::lo16)) {
8603 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8604 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8605 MRI.replaceRegWith(DstReg, NewDstReg);
8606 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8607 return;
8608 }
8609 }
8610
8611 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8612 MRI.replaceRegWith(DstReg, NewDstReg);
8613 legalizeOperands(Inst, MDT);
8614 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8615 return;
8616 }
8617
8618 // Use the new VALU Opcode.
8619 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8620 .setMIFlags(Inst.getFlags());
8621 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8622 // Intersperse VOP3 modifiers among the SALU operands.
8623 NewInstr->addOperand(Inst.getOperand(0));
8624 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8625 AMDGPU::OpName::src0_modifiers) >= 0)
8626 NewInstr.addImm(0);
8627 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8628 const MachineOperand &Src = Inst.getOperand(1);
8629 NewInstr->addOperand(Src);
8630 }
8631
8632 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8633 // We are converting these to a BFE, so we need to add the missing
8634 // operands for the size and offset.
8635 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8636 NewInstr.addImm(0);
8637 NewInstr.addImm(Size);
8638 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8639 // The VALU version adds the second operand to the result, so insert an
8640 // extra 0 operand.
8641 NewInstr.addImm(0);
8642 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8643 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8644 // If we need to move this to VGPRs, we need to unpack the second
8645 // operand back into the 2 separate ones for bit offset and width.
8646 assert(OffsetWidthOp.isImm() &&
8647 "Scalar BFE is only implemented for constant width and offset");
8648 uint32_t Imm = OffsetWidthOp.getImm();
8649
8650 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8651 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8652 NewInstr.addImm(Offset);
8653 NewInstr.addImm(BitWidth);
8654 } else {
8655 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8656 AMDGPU::OpName::src1_modifiers) >= 0)
8657 NewInstr.addImm(0);
8658 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8659 NewInstr->addOperand(Inst.getOperand(2));
8660 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8661 AMDGPU::OpName::src2_modifiers) >= 0)
8662 NewInstr.addImm(0);
8663 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8664 NewInstr->addOperand(Inst.getOperand(3));
8665 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8666 NewInstr.addImm(0);
8667 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8668 NewInstr.addImm(0);
8669 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8670 NewInstr.addImm(0);
8671 }
8672 } else {
8673 // Just copy the SALU operands.
8674 for (const MachineOperand &Op : Inst.explicit_operands())
8675 NewInstr->addOperand(Op);
8676 }
8677
8678 // Remove any references to SCC. Vector instructions can't read from it, and
8679 // We're just about to add the implicit use / defs of VCC, and we don't want
8680 // both.
8681 for (MachineOperand &Op : Inst.implicit_operands()) {
8682 if (Op.getReg() == AMDGPU::SCC) {
8683 // Only propagate through live-def of SCC.
8684 if (Op.isDef() && !Op.isDead())
8685 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8686 if (Op.isUse())
8687 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8688 }
8689 }
8690 Inst.eraseFromParent();
8691 Register NewDstReg;
8692 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8693 Register DstReg = NewInstr->getOperand(0).getReg();
8694 assert(DstReg.isVirtual());
8695 // Update the destination register class.
8696 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8697 assert(NewDstRC);
8698 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8699 MRI.replaceRegWith(DstReg, NewDstReg);
8700 }
8701 fixImplicitOperands(*NewInstr);
8702
8703 legalizeOperandsVALUt16(*NewInstr, MRI);
8704
8705 // Legalize the operands
8706 legalizeOperands(*NewInstr, MDT);
8707 if (NewDstReg)
8708 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8709}
8710
8711// Add/sub require special handling to deal with carry outs.
8712std::pair<bool, MachineBasicBlock *>
8713SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8714 MachineDominatorTree *MDT) const {
8715 if (ST.hasAddNoCarryInsts()) {
8716 // Assume there is no user of scc since we don't select this in that case.
8717 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8718 // is used.
8719
8720 MachineBasicBlock &MBB = *Inst.getParent();
8721 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8722
8723 Register OldDstReg = Inst.getOperand(0).getReg();
8724 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8725
8726 unsigned Opc = Inst.getOpcode();
8727 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8728
8729 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8730 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8731
8732 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8733 Inst.removeOperand(3);
8734
8735 Inst.setDesc(get(NewOpc));
8736 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8737 Inst.addImplicitDefUseOperands(*MBB.getParent());
8738 MRI.replaceRegWith(OldDstReg, ResultReg);
8739 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8740
8741 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8742 return std::pair(true, NewBB);
8743 }
8744
8745 return std::pair(false, nullptr);
8746}
8747
8748void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8749 MachineDominatorTree *MDT) const {
8750
8751 MachineBasicBlock &MBB = *Inst.getParent();
8752 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8753 MachineBasicBlock::iterator MII = Inst;
8754 const DebugLoc &DL = Inst.getDebugLoc();
8755
8756 MachineOperand &Dest = Inst.getOperand(0);
8757 MachineOperand &Src0 = Inst.getOperand(1);
8758 MachineOperand &Src1 = Inst.getOperand(2);
8759 MachineOperand &Cond = Inst.getOperand(3);
8760
8761 Register CondReg = Cond.getReg();
8762 bool IsSCC = (CondReg == AMDGPU::SCC);
8763
8764 // If this is a trivial select where the condition is effectively not SCC
8765 // (CondReg is a source of copy to SCC), then the select is semantically
8766 // equivalent to copying CondReg. Hence, there is no need to create
8767 // V_CNDMASK, we can just use that and bail out.
8768 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8769 (Src1.getImm() == 0)) {
8770 MRI.replaceRegWith(Dest.getReg(), CondReg);
8771 return;
8772 }
8773
8774 Register NewCondReg = CondReg;
8775 if (IsSCC) {
8776 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8777 NewCondReg = MRI.createVirtualRegister(TC);
8778
8779 // Now look for the closest SCC def if it is a copy
8780 // replacing the CondReg with the COPY source register
8781 bool CopyFound = false;
8782 for (MachineInstr &CandI :
8784 Inst.getParent()->rend())) {
8785 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8786 -1) {
8787 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8788 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8789 .addReg(CandI.getOperand(1).getReg());
8790 CopyFound = true;
8791 }
8792 break;
8793 }
8794 }
8795 if (!CopyFound) {
8796 // SCC def is not a copy
8797 // Insert a trivial select instead of creating a copy, because a copy from
8798 // SCC would semantically mean just copying a single bit, but we may need
8799 // the result to be a vector condition mask that needs preserving.
8800 unsigned Opcode =
8801 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8802 auto NewSelect =
8803 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8804 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8805 }
8806 }
8807
8808 Register NewDestReg = MRI.createVirtualRegister(
8809 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8810 MachineInstr *NewInst;
8811 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8812 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8813 .addImm(0)
8814 .add(Src1) // False
8815 .addImm(0)
8816 .add(Src0) // True
8817 .addReg(NewCondReg);
8818 } else {
8819 NewInst =
8820 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8821 .add(Src1) // False
8822 .add(Src0) // True
8823 .addReg(NewCondReg);
8824 }
8825 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8826 legalizeOperands(*NewInst, MDT);
8827 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8828}
8829
8830void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8831 MachineInstr &Inst) const {
8832 MachineBasicBlock &MBB = *Inst.getParent();
8833 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8834 MachineBasicBlock::iterator MII = Inst;
8835 const DebugLoc &DL = Inst.getDebugLoc();
8836
8837 MachineOperand &Dest = Inst.getOperand(0);
8838 MachineOperand &Src = Inst.getOperand(1);
8839 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8840 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8841
8842 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8843 : AMDGPU::V_SUB_CO_U32_e32;
8844
8845 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8846 .addImm(0)
8847 .addReg(Src.getReg());
8848
8849 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8850 .addReg(Src.getReg())
8851 .addReg(TmpReg);
8852
8853 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8854 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8855}
8856
8857void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8858 MachineInstr &Inst) const {
8859 MachineBasicBlock &MBB = *Inst.getParent();
8860 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8861 MachineBasicBlock::iterator MII = Inst;
8862 const DebugLoc &DL = Inst.getDebugLoc();
8863
8864 MachineOperand &Dest = Inst.getOperand(0);
8865 MachineOperand &Src1 = Inst.getOperand(1);
8866 MachineOperand &Src2 = Inst.getOperand(2);
8867 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8868 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8869 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8870
8871 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8872 : AMDGPU::V_SUB_CO_U32_e32;
8873
8874 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8875 .addReg(Src1.getReg())
8876 .addReg(Src2.getReg());
8877
8878 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8879
8880 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8881 .addReg(SubResultReg)
8882 .addReg(TmpReg);
8883
8884 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8885 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8886}
8887
8888void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8889 MachineInstr &Inst) const {
8890 MachineBasicBlock &MBB = *Inst.getParent();
8891 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8892 MachineBasicBlock::iterator MII = Inst;
8893 const DebugLoc &DL = Inst.getDebugLoc();
8894
8895 MachineOperand &Dest = Inst.getOperand(0);
8896 MachineOperand &Src0 = Inst.getOperand(1);
8897 MachineOperand &Src1 = Inst.getOperand(2);
8898
8899 if (ST.hasDLInsts()) {
8900 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8901 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8902 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8903
8904 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8905 .add(Src0)
8906 .add(Src1);
8907
8908 MRI.replaceRegWith(Dest.getReg(), NewDest);
8909 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8910 } else {
8911 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8912 // invert either source and then perform the XOR. If either source is a
8913 // scalar register, then we can leave the inversion on the scalar unit to
8914 // achieve a better distribution of scalar and vector instructions.
8915 bool Src0IsSGPR = Src0.isReg() &&
8916 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8917 bool Src1IsSGPR = Src1.isReg() &&
8918 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8919 MachineInstr *Xor;
8920 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8921 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8922
8923 // Build a pair of scalar instructions and add them to the work list.
8924 // The next iteration over the work list will lower these to the vector
8925 // unit as necessary.
8926 if (Src0IsSGPR) {
8927 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8928 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8929 .addReg(Temp)
8930 .add(Src1);
8931 } else if (Src1IsSGPR) {
8932 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8933 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8934 .add(Src0)
8935 .addReg(Temp);
8936 } else {
8937 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8938 .add(Src0)
8939 .add(Src1);
8940 MachineInstr *Not =
8941 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8942 Worklist.insert(Not);
8943 }
8944
8945 MRI.replaceRegWith(Dest.getReg(), NewDest);
8946
8947 Worklist.insert(Xor);
8948
8949 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8950 }
8951}
8952
8953void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8954 MachineInstr &Inst,
8955 unsigned Opcode) const {
8956 MachineBasicBlock &MBB = *Inst.getParent();
8957 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8958 MachineBasicBlock::iterator MII = Inst;
8959 const DebugLoc &DL = Inst.getDebugLoc();
8960
8961 MachineOperand &Dest = Inst.getOperand(0);
8962 MachineOperand &Src0 = Inst.getOperand(1);
8963 MachineOperand &Src1 = Inst.getOperand(2);
8964
8965 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8966 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8967
8968 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8969 .add(Src0)
8970 .add(Src1);
8971
8972 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8973 .addReg(Interm);
8974
8975 Worklist.insert(&Op);
8976 Worklist.insert(&Not);
8977
8978 MRI.replaceRegWith(Dest.getReg(), NewDest);
8979 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8980}
8981
8982void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8983 MachineInstr &Inst,
8984 unsigned Opcode) const {
8985 MachineBasicBlock &MBB = *Inst.getParent();
8986 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8987 MachineBasicBlock::iterator MII = Inst;
8988 const DebugLoc &DL = Inst.getDebugLoc();
8989
8990 MachineOperand &Dest = Inst.getOperand(0);
8991 MachineOperand &Src0 = Inst.getOperand(1);
8992 MachineOperand &Src1 = Inst.getOperand(2);
8993
8994 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8995 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8996
8997 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8998 .add(Src1);
8999
9000 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
9001 .add(Src0)
9002 .addReg(Interm);
9003
9004 Worklist.insert(&Not);
9005 Worklist.insert(&Op);
9006
9007 MRI.replaceRegWith(Dest.getReg(), NewDest);
9008 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
9009}
9010
9011void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
9012 MachineInstr &Inst, unsigned Opcode,
9013 bool Swap) const {
9014 MachineBasicBlock &MBB = *Inst.getParent();
9015 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9016
9017 MachineOperand &Dest = Inst.getOperand(0);
9018 MachineOperand &Src0 = Inst.getOperand(1);
9019 const DebugLoc &DL = Inst.getDebugLoc();
9020
9021 MachineBasicBlock::iterator MII = Inst;
9022
9023 const MCInstrDesc &InstDesc = get(Opcode);
9024 const TargetRegisterClass *Src0RC = Src0.isReg() ?
9025 MRI.getRegClass(Src0.getReg()) :
9026 &AMDGPU::SGPR_32RegClass;
9027
9028 const TargetRegisterClass *Src0SubRC =
9029 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9030
9031 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9032 AMDGPU::sub0, Src0SubRC);
9033
9034 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9035 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
9036 const TargetRegisterClass *NewDestSubRC =
9037 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9038
9039 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
9040 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
9041
9042 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9043 AMDGPU::sub1, Src0SubRC);
9044
9045 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
9046 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
9047
9048 if (Swap)
9049 std::swap(DestSub0, DestSub1);
9050
9051 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
9052 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9053 .addReg(DestSub0)
9054 .addImm(AMDGPU::sub0)
9055 .addReg(DestSub1)
9056 .addImm(AMDGPU::sub1);
9057
9058 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9059
9060 Worklist.insert(&LoHalf);
9061 Worklist.insert(&HiHalf);
9062
9063 // We don't need to legalizeOperands here because for a single operand, src0
9064 // will support any kind of input.
9065
9066 // Move all users of this moved value.
9067 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9068}
9069
9070// There is not a vector equivalent of s_mul_u64. For this reason, we need to
9071// split the s_mul_u64 in 32-bit vector multiplications.
9072void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
9073 MachineInstr &Inst,
9074 MachineDominatorTree *MDT) const {
9075 MachineBasicBlock &MBB = *Inst.getParent();
9076 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9077
9078 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9079 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9080 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9081
9082 MachineOperand &Dest = Inst.getOperand(0);
9083 MachineOperand &Src0 = Inst.getOperand(1);
9084 MachineOperand &Src1 = Inst.getOperand(2);
9085 const DebugLoc &DL = Inst.getDebugLoc();
9086 MachineBasicBlock::iterator MII = Inst;
9087
9088 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
9089 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
9090 const TargetRegisterClass *Src0SubRC =
9091 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9092 if (RI.isSGPRClass(Src0SubRC))
9093 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
9094 const TargetRegisterClass *Src1SubRC =
9095 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9096 if (RI.isSGPRClass(Src1SubRC))
9097 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
9098
9099 // First, we extract the low 32-bit and high 32-bit values from each of the
9100 // operands.
9101 MachineOperand Op0L =
9102 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
9103 MachineOperand Op1L =
9104 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
9105 MachineOperand Op0H =
9106 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
9107 MachineOperand Op1H =
9108 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
9109
9110 // The multilication is done as follows:
9111 //
9112 // Op1H Op1L
9113 // * Op0H Op0L
9114 // --------------------
9115 // Op1H*Op0L Op1L*Op0L
9116 // + Op1H*Op0H Op1L*Op0H
9117 // -----------------------------------------
9118 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
9119 //
9120 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
9121 // value and that would overflow.
9122 // The low 32-bit value is Op1L*Op0L.
9123 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
9124
9125 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9126 MachineInstr *Op1L_Op0H =
9127 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
9128 .add(Op1L)
9129 .add(Op0H);
9130
9131 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9132 MachineInstr *Op1H_Op0L =
9133 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
9134 .add(Op1H)
9135 .add(Op0L);
9136
9137 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9138 MachineInstr *Carry =
9139 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
9140 .add(Op1L)
9141 .add(Op0L);
9142
9143 MachineInstr *LoHalf =
9144 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
9145 .add(Op1L)
9146 .add(Op0L);
9147
9148 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9149 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
9150 .addReg(Op1L_Op0H_Reg)
9151 .addReg(Op1H_Op0L_Reg);
9152
9153 MachineInstr *HiHalf =
9154 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
9155 .addReg(AddReg)
9156 .addReg(CarryReg);
9157
9158 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9159 .addReg(DestSub0)
9160 .addImm(AMDGPU::sub0)
9161 .addReg(DestSub1)
9162 .addImm(AMDGPU::sub1);
9163
9164 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9165
9166 // Try to legalize the operands in case we need to swap the order to keep it
9167 // valid.
9168 legalizeOperands(*Op1L_Op0H, MDT);
9169 legalizeOperands(*Op1H_Op0L, MDT);
9170 legalizeOperands(*Carry, MDT);
9171 legalizeOperands(*LoHalf, MDT);
9172 legalizeOperands(*Add, MDT);
9173 legalizeOperands(*HiHalf, MDT);
9174
9175 // Move all users of this moved value.
9176 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9177}
9178
9179// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
9180// multiplications.
9181void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
9182 MachineInstr &Inst,
9183 MachineDominatorTree *MDT) const {
9184 MachineBasicBlock &MBB = *Inst.getParent();
9185 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9186
9187 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9188 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9189 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9190
9191 MachineOperand &Dest = Inst.getOperand(0);
9192 MachineOperand &Src0 = Inst.getOperand(1);
9193 MachineOperand &Src1 = Inst.getOperand(2);
9194 const DebugLoc &DL = Inst.getDebugLoc();
9195 MachineBasicBlock::iterator MII = Inst;
9196
9197 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
9198 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
9199 const TargetRegisterClass *Src0SubRC =
9200 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9201 if (RI.isSGPRClass(Src0SubRC))
9202 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
9203 const TargetRegisterClass *Src1SubRC =
9204 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9205 if (RI.isSGPRClass(Src1SubRC))
9206 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
9207
9208 // First, we extract the low 32-bit and high 32-bit values from each of the
9209 // operands.
9210 MachineOperand Op0L =
9211 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
9212 MachineOperand Op1L =
9213 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
9214
9215 unsigned Opc = Inst.getOpcode();
9216 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
9217 ? AMDGPU::V_MUL_HI_U32_e64
9218 : AMDGPU::V_MUL_HI_I32_e64;
9219 MachineInstr *HiHalf =
9220 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
9221
9222 MachineInstr *LoHalf =
9223 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
9224 .add(Op1L)
9225 .add(Op0L);
9226
9227 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9228 .addReg(DestSub0)
9229 .addImm(AMDGPU::sub0)
9230 .addReg(DestSub1)
9231 .addImm(AMDGPU::sub1);
9232
9233 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9234
9235 // Try to legalize the operands in case we need to swap the order to keep it
9236 // valid.
9237 legalizeOperands(*HiHalf, MDT);
9238 legalizeOperands(*LoHalf, MDT);
9239
9240 // Move all users of this moved value.
9241 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9242}
9243
9244void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
9245 MachineInstr &Inst, unsigned Opcode,
9246 MachineDominatorTree *MDT) const {
9247 MachineBasicBlock &MBB = *Inst.getParent();
9248 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9249
9250 MachineOperand &Dest = Inst.getOperand(0);
9251 MachineOperand &Src0 = Inst.getOperand(1);
9252 MachineOperand &Src1 = Inst.getOperand(2);
9253 const DebugLoc &DL = Inst.getDebugLoc();
9254
9255 MachineBasicBlock::iterator MII = Inst;
9256
9257 const MCInstrDesc &InstDesc = get(Opcode);
9258 const TargetRegisterClass *Src0RC = Src0.isReg() ?
9259 MRI.getRegClass(Src0.getReg()) :
9260 &AMDGPU::SGPR_32RegClass;
9261
9262 const TargetRegisterClass *Src0SubRC =
9263 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9264 const TargetRegisterClass *Src1RC = Src1.isReg() ?
9265 MRI.getRegClass(Src1.getReg()) :
9266 &AMDGPU::SGPR_32RegClass;
9267
9268 const TargetRegisterClass *Src1SubRC =
9269 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9270
9271 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9272 AMDGPU::sub0, Src0SubRC);
9273 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9274 AMDGPU::sub0, Src1SubRC);
9275 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9276 AMDGPU::sub1, Src0SubRC);
9277 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9278 AMDGPU::sub1, Src1SubRC);
9279
9280 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9281 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
9282 const TargetRegisterClass *NewDestSubRC =
9283 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9284
9285 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
9286 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
9287 .add(SrcReg0Sub0)
9288 .add(SrcReg1Sub0);
9289
9290 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
9291 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
9292 .add(SrcReg0Sub1)
9293 .add(SrcReg1Sub1);
9294
9295 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
9296 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9297 .addReg(DestSub0)
9298 .addImm(AMDGPU::sub0)
9299 .addReg(DestSub1)
9300 .addImm(AMDGPU::sub1);
9301
9302 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9303
9304 Worklist.insert(&LoHalf);
9305 Worklist.insert(&HiHalf);
9306
9307 // Move all users of this moved value.
9308 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9309}
9310
9311void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9312 MachineInstr &Inst,
9313 MachineDominatorTree *MDT) const {
9314 MachineBasicBlock &MBB = *Inst.getParent();
9315 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9316
9317 MachineOperand &Dest = Inst.getOperand(0);
9318 MachineOperand &Src0 = Inst.getOperand(1);
9319 MachineOperand &Src1 = Inst.getOperand(2);
9320 const DebugLoc &DL = Inst.getDebugLoc();
9321
9322 MachineBasicBlock::iterator MII = Inst;
9323
9324 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9325
9326 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9327
9328 MachineOperand* Op0;
9329 MachineOperand* Op1;
9330
9331 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9332 Op0 = &Src0;
9333 Op1 = &Src1;
9334 } else {
9335 Op0 = &Src1;
9336 Op1 = &Src0;
9337 }
9338
9339 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9340 .add(*Op0);
9341
9342 Register NewDest = MRI.createVirtualRegister(DestRC);
9343
9344 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9345 .addReg(Interm)
9346 .add(*Op1);
9347
9348 MRI.replaceRegWith(Dest.getReg(), NewDest);
9349
9350 Worklist.insert(&Xor);
9351}
9352
9353void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9354 MachineInstr &Inst) const {
9355 MachineBasicBlock &MBB = *Inst.getParent();
9356 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9357
9358 MachineBasicBlock::iterator MII = Inst;
9359 const DebugLoc &DL = Inst.getDebugLoc();
9360
9361 MachineOperand &Dest = Inst.getOperand(0);
9362 MachineOperand &Src = Inst.getOperand(1);
9363
9364 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
9365 const TargetRegisterClass *SrcRC = Src.isReg() ?
9366 MRI.getRegClass(Src.getReg()) :
9367 &AMDGPU::SGPR_32RegClass;
9368
9369 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9370 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9371
9372 const TargetRegisterClass *SrcSubRC =
9373 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9374
9375 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9376 AMDGPU::sub0, SrcSubRC);
9377 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9378 AMDGPU::sub1, SrcSubRC);
9379
9380 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
9381
9382 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
9383
9384 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9385
9386 // We don't need to legalize operands here. src0 for either instruction can be
9387 // an SGPR, and the second input is unused or determined here.
9388 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9389}
9390
9391void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9392 MachineInstr &Inst) const {
9393 MachineBasicBlock &MBB = *Inst.getParent();
9394 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9395 MachineBasicBlock::iterator MII = Inst;
9396 const DebugLoc &DL = Inst.getDebugLoc();
9397
9398 MachineOperand &Dest = Inst.getOperand(0);
9399 uint32_t Imm = Inst.getOperand(2).getImm();
9400 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9401 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
9402
9403 (void) Offset;
9404
9405 // Only sext_inreg cases handled.
9406 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9407 Offset == 0 && "Not implemented");
9408
9409 if (BitWidth < 32) {
9410 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9411 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9412 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9413
9414 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
9415 .addReg(Inst.getOperand(1).getReg(), {}, AMDGPU::sub0)
9416 .addImm(0)
9417 .addImm(BitWidth);
9418
9419 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
9420 .addImm(31)
9421 .addReg(MidRegLo);
9422
9423 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9424 .addReg(MidRegLo)
9425 .addImm(AMDGPU::sub0)
9426 .addReg(MidRegHi)
9427 .addImm(AMDGPU::sub1);
9428
9429 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9430 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9431 return;
9432 }
9433
9434 MachineOperand &Src = Inst.getOperand(1);
9435 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9436 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9437
9438 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
9439 .addImm(31)
9440 .addReg(Src.getReg(), {}, AMDGPU::sub0);
9441
9442 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9443 .addReg(Src.getReg(), {}, AMDGPU::sub0)
9444 .addImm(AMDGPU::sub0)
9445 .addReg(TmpReg)
9446 .addImm(AMDGPU::sub1);
9447
9448 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9449 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9450}
9451
9452void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9453 MachineInstr &Inst, unsigned Opcode,
9454 MachineDominatorTree *MDT) const {
9455 // (S_FLBIT_I32_B64 hi:lo) ->
9456 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9457 // (S_FF1_I32_B64 hi:lo) ->
9458 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
9459
9460 MachineBasicBlock &MBB = *Inst.getParent();
9461 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9462 MachineBasicBlock::iterator MII = Inst;
9463 const DebugLoc &DL = Inst.getDebugLoc();
9464
9465 MachineOperand &Dest = Inst.getOperand(0);
9466 MachineOperand &Src = Inst.getOperand(1);
9467
9468 const MCInstrDesc &InstDesc = get(Opcode);
9469
9470 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9471 unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
9472 : AMDGPU::V_ADD_CO_U32_e32;
9473
9474 const TargetRegisterClass *SrcRC =
9475 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9476 const TargetRegisterClass *SrcSubRC =
9477 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9478
9479 MachineOperand SrcRegSub0 =
9480 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9481 MachineOperand SrcRegSub1 =
9482 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9483
9484 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9485 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9486 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9487 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9488
9489 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9490
9491 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9492
9493 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9494 .addReg(IsCtlz ? MidReg1 : MidReg2)
9495 .addImm(32)
9496 .addImm(1); // enable clamp
9497
9498 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9499 .addReg(MidReg3)
9500 .addReg(IsCtlz ? MidReg2 : MidReg1);
9501
9502 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9503
9504 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9505}
9506
9507void SIInstrInfo::addUsersToMoveToVALUWorklist(
9508 Register DstReg, MachineRegisterInfo &MRI,
9509 SIInstrWorklist &Worklist) const {
9510 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9511 MachineInstr &UseMI = *MO.getParent();
9512
9513 unsigned OpNo = 0;
9514
9515 switch (UseMI.getOpcode()) {
9516 case AMDGPU::COPY:
9517 case AMDGPU::WQM:
9518 case AMDGPU::SOFT_WQM:
9519 case AMDGPU::STRICT_WWM:
9520 case AMDGPU::STRICT_WQM:
9521 case AMDGPU::REG_SEQUENCE:
9522 case AMDGPU::PHI:
9523 case AMDGPU::INSERT_SUBREG:
9524 break;
9525 default:
9526 OpNo = MO.getOperandNo();
9527 break;
9528 }
9529
9530 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9531 MRI.constrainRegClass(DstReg, OpRC);
9532
9533 if (!RI.hasVectorRegisters(OpRC))
9534 Worklist.insert(&UseMI);
9535 else
9536 // Legalization could change user list.
9537 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9538 }
9539}
9540
// Lower an S_PACK_{LL,LH,HL,HH}_B32_B16 instruction to VALU code that writes a
// fresh VGPR_32, then redirect every use of the original destination to that
// register and queue its users via addUsersToMoveToVALUWorklist().
// NOTE(review): MRI is used throughout but its declaration is not visible
// here - presumably a parameter; confirm against the header.
9541void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9543 MachineInstr &Inst) const {
9544 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9545 MachineBasicBlock *MBB = Inst.getParent();
9546 MachineOperand &Src0 = Inst.getOperand(1);
9547 MachineOperand &Src1 = Inst.getOperand(2);
9548 const DebugLoc &DL = Inst.getDebugLoc();
9549
  // With real true16 instructions the result is assembled by a REG_SEQUENCE
  // of 16-bit halves instead of shift/mask arithmetic.
9550 if (ST.useRealTrue16Insts()) {
9551 Register SrcReg0, SrcReg1;
  // A source that is not already a VGPR register is first materialized in a
  // new VGPR_32: V_MOV_B32 for immediates, COPY for everything else.
9552 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9553 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9554 BuildMI(*MBB, Inst, DL,
9555 get(Src0.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg0)
9556 .add(Src0);
9557 } else {
9558 SrcReg0 = Src0.getReg();
9559 }
9560
9561 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9562 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9563 BuildMI(*MBB, Inst, DL,
9564 get(Src1.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg1)
9565 .add(Src1);
9566 } else {
9567 SrcReg1 = Src1.getReg();
9568 }
9569
  // When a source can be constrained to VGPR_16 it is used without a
  // sub-register index for its low half; otherwise lo16 is selected
  // explicitly in the cases below.
9570 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9571 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9572
  // In each case the first source supplies the result's lo16 and the second
  // source supplies its hi16; the opcode picks which half of each is read.
9573 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9574 switch (Inst.getOpcode()) {
9575 case AMDGPU::S_PACK_LL_B32_B16:
9576 NewMI
9577 .addReg(SrcReg0, {},
9578 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9579 .addImm(AMDGPU::lo16)
9580 .addReg(SrcReg1, {},
9581 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9582 .addImm(AMDGPU::hi16);
9583 break;
9584 case AMDGPU::S_PACK_LH_B32_B16:
9585 NewMI
9586 .addReg(SrcReg0, {},
9587 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9588 .addImm(AMDGPU::lo16)
9589 .addReg(SrcReg1, {}, AMDGPU::hi16)
9590 .addImm(AMDGPU::hi16);
9591 break;
9592 case AMDGPU::S_PACK_HL_B32_B16:
9593 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9594 .addImm(AMDGPU::lo16)
9595 .addReg(SrcReg1, {},
9596 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9597 .addImm(AMDGPU::hi16);
9598 break;
9599 case AMDGPU::S_PACK_HH_B32_B16:
9600 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9601 .addImm(AMDGPU::lo16)
9602 .addReg(SrcReg1, {}, AMDGPU::hi16)
9603 .addImm(AMDGPU::hi16);
9604 break;
9605 default:
9606 llvm_unreachable("unhandled s_pack_* instruction");
9607 }
9608
9609 MachineOperand &Dest = Inst.getOperand(0);
9610 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9611 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9612 return;
9613 }
9614
  // Without real true16 instructions the halves are combined with 32-bit
  // mask/shift/or instructions.
9615 switch (Inst.getOpcode()) {
9616 case AMDGPU::S_PACK_LL_B32_B16: {
9617 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9618 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9619
9620 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9621 // 0.
9622 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9623 .addImm(0xffff);
9624
  // TmpReg = 0xffff & Src0
9625 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9626 .addReg(ImmReg, RegState::Kill)
9627 .add(Src0);
9628
  // V_LSHL_OR_B32 operands: Src1, shift amount 16, TmpReg.
9629 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9630 .add(Src1)
9631 .addImm(16)
9632 .addReg(TmpReg, RegState::Kill);
9633 break;
9634 }
9635 case AMDGPU::S_PACK_LH_B32_B16: {
  // V_BFI_B32 operands: the 0xffff mask, Src0, Src1.
9636 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9637 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9638 .addImm(0xffff);
9639 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9640 .addReg(ImmReg, RegState::Kill)
9641 .add(Src0)
9642 .add(Src1);
9643 break;
9644 }
9645 case AMDGPU::S_PACK_HL_B32_B16: {
  // TmpReg = Src0 shifted right by 16, then combined with Src1 by
  // V_LSHL_OR_B32 using a shift amount of 16.
9646 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9647 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9648 .addImm(16)
9649 .add(Src0);
9650 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9651 .add(Src1)
9652 .addImm(16)
9653 .addReg(TmpReg, RegState::Kill);
9654 break;
9655 }
9656 case AMDGPU::S_PACK_HH_B32_B16: {
  // TmpReg = Src0 shifted right by 16; V_AND_OR_B32 then takes Src1, the
  // 0xffff0000 mask and TmpReg.
9657 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9658 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9659 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9660 .addImm(16)
9661 .add(Src0);
9662 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9663 .addImm(0xffff0000);
9664 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9665 .add(Src1)
9666 .addReg(ImmReg, RegState::Kill)
9667 .addReg(TmpReg, RegState::Kill);
9668 break;
9669 }
9670 default:
9671 llvm_unreachable("unhandled s_pack_* instruction");
9672 }
9673
9674 MachineOperand &Dest = Inst.getOperand(0);
9675 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9676 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9677}
9678
9679void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9680 MachineInstr &SCCDefInst,
9681 SIInstrWorklist &Worklist,
9682 Register NewCond) const {
9683
9684 // Ensure that def inst defines SCC, which is still live.
9685 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9686 !Op.isDead() && Op.getParent() == &SCCDefInst);
9687 SmallVector<MachineInstr *, 4> CopyToDelete;
9688 // This assumes that all the users of SCC are in the same block
9689 // as the SCC def.
9690 for (MachineInstr &MI : // Skip the def inst itself.
9691 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9692 SCCDefInst.getParent()->end())) {
9693 // Check if SCC is used first.
9694 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9695 if (SCCIdx != -1) {
9696 if (MI.isCopy()) {
9697 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9698 Register DestReg = MI.getOperand(0).getReg();
9699
9700 MRI.replaceRegWith(DestReg, NewCond);
9701 CopyToDelete.push_back(&MI);
9702 } else {
9703
9704 if (NewCond.isValid())
9705 MI.getOperand(SCCIdx).setReg(NewCond);
9706
9707 Worklist.insert(&MI);
9708 }
9709 }
9710 // Exit if we find another SCC def.
9711 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9712 break;
9713 }
9714 for (auto &Copy : CopyToDelete)
9715 Copy->eraseFromParent();
9716}
9717
9718// Instructions that use SCC may be converted to VALU instructions. When that
9719// happens, the SCC register is changed to VCC_LO. The instruction that defines
9720// SCC must be changed to an instruction that defines VCC. This function makes
9721// sure that the instruction that defines SCC is added to the moveToVALU
9722// worklist.
9723void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9724 SIInstrWorklist &Worklist) const {
9725 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9726 // then there is nothing to do because the defining instruction has been
9727 // converted to a VALU already. If SCC then that instruction needs to be
9728 // converted to a VALU.
9729 for (MachineInstr &MI :
9730 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9731 SCCUseInst->getParent()->rend())) {
9732 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9733 break;
9734 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9735 Worklist.insert(&MI);
9736 break;
9737 }
9738 }
9739}
9740
9741const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9742 const MachineInstr &Inst) const {
9743 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9744
9745 switch (Inst.getOpcode()) {
9746 // For target instructions, getOpRegClass just returns the virtual register
9747 // class associated with the operand, so we need to find an equivalent VGPR
9748 // register class in order to move the instruction to the VALU.
9749 case AMDGPU::COPY:
9750 case AMDGPU::PHI:
9751 case AMDGPU::REG_SEQUENCE:
9752 case AMDGPU::INSERT_SUBREG:
9753 case AMDGPU::WQM:
9754 case AMDGPU::SOFT_WQM:
9755 case AMDGPU::STRICT_WWM:
9756 case AMDGPU::STRICT_WQM: {
9757 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9758 if (RI.isAGPRClass(SrcRC)) {
9759 if (RI.isAGPRClass(NewDstRC))
9760 return nullptr;
9761
9762 switch (Inst.getOpcode()) {
9763 case AMDGPU::PHI:
9764 case AMDGPU::REG_SEQUENCE:
9765 case AMDGPU::INSERT_SUBREG:
9766 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9767 break;
9768 default:
9769 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9770 }
9771
9772 if (!NewDstRC)
9773 return nullptr;
9774 } else {
9775 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9776 return nullptr;
9777
9778 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9779 if (!NewDstRC)
9780 return nullptr;
9781 }
9782
9783 return NewDstRC;
9784 }
9785 default:
9786 return NewDstRC;
9787 }
9788}
9789
9790// Find the one SGPR operand we are allowed to use.
9791Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9792 int OpIndices[3]) const {
9793 const MCInstrDesc &Desc = MI.getDesc();
9794
9795 // Find the one SGPR operand we are allowed to use.
9796 //
9797 // First we need to consider the instruction's operand requirements before
9798 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9799 // of VCC, but we are still bound by the constant bus requirement to only use
9800 // one.
9801 //
9802 // If the operand's class is an SGPR, we can never move it.
9803
9804 Register SGPRReg = findImplicitSGPRRead(MI);
9805 if (SGPRReg)
9806 return SGPRReg;
9807
9808 Register UsedSGPRs[3] = {Register()};
9809 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9810
9811 for (unsigned i = 0; i < 3; ++i) {
9812 int Idx = OpIndices[i];
9813 if (Idx == -1)
9814 break;
9815
9816 const MachineOperand &MO = MI.getOperand(Idx);
9817 if (!MO.isReg())
9818 continue;
9819
9820 // Is this operand statically required to be an SGPR based on the operand
9821 // constraints?
9822 const TargetRegisterClass *OpRC =
9823 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9824 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9825 if (IsRequiredSGPR)
9826 return MO.getReg();
9827
9828 // If this could be a VGPR or an SGPR, Check the dynamic register class.
9829 Register Reg = MO.getReg();
9830 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9831 if (RI.isSGPRClass(RegRC))
9832 UsedSGPRs[i] = Reg;
9833 }
9834
9835 // We don't have a required SGPR operand, so we have a bit more freedom in
9836 // selecting operands to move.
9837
9838 // Try to select the most used SGPR. If an SGPR is equal to one of the
9839 // others, we choose that.
9840 //
9841 // e.g.
9842 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9843 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9844
9845 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9846 // prefer those.
9847
9848 if (UsedSGPRs[0]) {
9849 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9850 SGPRReg = UsedSGPRs[0];
9851 }
9852
9853 if (!SGPRReg && UsedSGPRs[1]) {
9854 if (UsedSGPRs[1] == UsedSGPRs[2])
9855 SGPRReg = UsedSGPRs[1];
9856 }
9857
9858 return SGPRReg;
9859}
9860
9862 AMDGPU::OpName OperandName) const {
  // Resolve OperandName to an operand index for MI's opcode and return a
  // pointer to that operand. nullptr is returned when OperandName is
  // NUM_OPERAND_NAMES or when getNamedOperandIdx reports -1 for this opcode.
9863 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9864 return nullptr;
9865
9866 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9867 if (Idx == -1)
9868 return nullptr;
9869
9870 return &MI.getOperand(Idx);
9871}
9872
  // GFX10 and newer: the result is a format value shifted to bit 44, combined
  // with RESOURCE_LEVEL = 1 (bit 56) and OOB_SELECT = 3 (bit 60).
  // NOTE(review): the initializer of Format is cut off here; confirm which
  // format constants are selected per generation.
9874 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9875 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9878 return (Format << 44) |
9879 (1ULL << 56) | // RESOURCE_LEVEL = 1
9880 (3ULL << 60); // OOB_SELECT = 3
9881 }
9882
  // Older generations start from RSRC_DATA_FORMAT and, on AMD HSA only, OR in
  // generation-specific bits.
9883 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9884 if (ST.isAmdHsaOS()) {
9885 // Set ATC = 1. GFX9 doesn't have this bit.
9886 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9887 RsrcDataFormat |= (1ULL << 56);
9888
9889 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9890 // BTW, it disables TC L2 and therefore decreases performance.
9891 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9892 RsrcDataFormat |= (2ULL << 59);
9893 }
9894
9895 return RsrcDataFormat;
9896}
9897
9901 0xffffffff; // Size;
9902
9903 // GFX9 doesn't have ELEMENT_SIZE.
  // The field value is log2 of the maximum private element size, minus 1.
9904 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9905 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9906 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9907 }
9908
9909 // IndexStride = 64 / 32.
  // Encoded as 3 for wave64 and 2 otherwise.
9910 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9911 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9912
9913 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9914 // Clear them unless we want a huge stride.
  // Only applied for generations from VOLCANIC_ISLANDS through GFX9.
9915 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9916 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9917 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9918
9919 return Rsrc23;
9920}
9921
  // The answer depends only on the opcode: true exactly when isSMRD(Opc) is.
9923 unsigned Opc = MI.getOpcode();
9924
9925 return isSMRD(Opc);
9926}
9927
  // True for opcodes that may load and are MUBUF, MTBUF, MIMG or FLAT.
9929 return get(Opc).mayLoad() &&
9930 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9931}
9932
9934 TypeSize &MemBytes) const {
  // If MI's vaddr operand is a frame index, store that index in FrameIndex,
  // store the fixed size of the vdata operand in MemBytes and return the
  // vdata register. Otherwise return an invalid Register and leave both
  // outputs untouched.
9935 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9936 if (!Addr || !Addr->isFI())
9937 return Register();
9938
  // A frame-index access is expected to carry a private-address memory operand.
9939 assert(!MI.memoperands_empty() &&
9940 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9941
9942 FrameIndex = Addr->getIndex();
9943
9944 int VDataIdx =
9945 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
9946 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), VDataIdx));
9947 return MI.getOperand(VDataIdx).getReg();
9948}
9949
9951 TypeSize &MemBytes) const {
  // The addr operand is asserted to exist and to be a frame index. Its index
  // is stored in FrameIndex, the fixed size of the data operand in MemBytes,
  // and the data register is returned.
9952 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9953 assert(Addr && Addr->isFI());
9954 FrameIndex = Addr->getIndex();
9955
9956 int DataIdx =
9957 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::data);
9958 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), DataIdx));
9959 return MI.getOperand(DataIdx).getReg();
9960}
9961
9963 int &FrameIndex,
9964 TypeSize &MemBytes) const {
  // Only instructions that may load are considered. MUBUF and VGPR-spill
  // instructions are checked through isStackAccess, SGPR spills through
  // isSGPRStackAccess; anything else yields an invalid Register.
9965 if (!MI.mayLoad())
9966 return Register();
9967
9968 if (isMUBUF(MI) || isVGPRSpill(MI))
9969 return isStackAccess(MI, FrameIndex, MemBytes);
9970
9971 if (isSGPRSpill(MI))
9972 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9973
9974 return Register();
9975}
9976
9978 int &FrameIndex,
9979 TypeSize &MemBytes) const {
  // Only instructions that may store are considered. MUBUF and VGPR-spill
  // instructions are checked through isStackAccess, SGPR spills through
  // isSGPRStackAccess; anything else yields an invalid Register.
9980 if (!MI.mayStore())
9981 return Register();
9982
9983 if (isMUBUF(MI) || isVGPRSpill(MI))
9984 return isStackAccess(MI, FrameIndex, MemBytes);
9985
9986 if (isSGPRSpill(MI))
9987 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9988
9989 return Register();
9990}
9991
  // Computes a size in bytes for MI, starting from the descriptor size and
  // adding any trailing literal or extra NSA words.
  // NOTE(review): `Desc` is used below but its declaration is not visible
  // here - presumably MI.getDesc(); confirm.
9993 unsigned Opc = MI.getOpcode();
9995 unsigned DescSize = Desc.getSize();
9996
9997 // If we have a definitive size, we can use it. Otherwise we need to inspect
9998 // the operands to know the size.
9999 if (isFixedSize(MI)) {
10000 unsigned Size = DescSize;
10001
10002 // If we hit the buggy offset, an extra nop will be inserted in MC so
10003 // estimate the worst case.
10004 if (MI.isBranch() && ST.hasOffset3fBug())
10005 Size += 4;
10006
10007 return Size;
10008 }
10009
10010 // Instructions may have a 32-bit literal encoded after them. Check
10011 // operands that could ever be literals.
10012 if (isVALU(MI) || isSALU(MI)) {
  // DPP instructions are reported at descriptor size without a literal scan.
10013 if (isDPP(MI))
10014 return DescSize;
10015 bool HasLiteral = false;
10016 unsigned LiteralSize = 4;
10017 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
10018 const MachineOperand &Op = MI.getOperand(I);
10019 const MCOperandInfo &OpInfo = Desc.operands()[I];
10020 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
10021 HasLiteral = true;
  // With 64-bit literal support the literal may need 8 bytes, depending on
  // the operand type.
  // NOTE(review): the case labels of this switch are not visible here.
10022 if (ST.has64BitLiterals()) {
10023 switch (OpInfo.OperandType) {
10024 default:
10025 break;
10028 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
10029 LiteralSize = 8;
10030 break;
10033 // A 32-bit literal is only valid when the value fits in BOTH signed
10034 // and unsigned 32-bit ranges [0, 2^31-1], matching the MC code
10035 // emitter's getLit64Encoding logic. This is because of the lack of
10036 // ability to tell signedness of the literal, therefore we need to
10037 // be conservative and assume values outside this range require a
10038 // 64-bit literal encoding (8 bytes).
10039 if (!Op.isImm() || !isInt<32>(Op.getImm()) ||
10040 !isUInt<32>(Op.getImm()))
10041 LiteralSize = 8;
10042 break;
10043 }
10044 }
  // Stop scanning at the first literal operand found.
10045 break;
10046 }
10047 }
10048 return HasLiteral ? DescSize + LiteralSize : DescSize;
10049 }
10050
10051 // Check whether we have extra NSA words.
10052 if (isMIMG(MI)) {
10053 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
  // No vaddr0 operand: fixed 8-byte size.
10054 if (VAddr0Idx < 0)
10055 return 8;
10056
  // Otherwise 8 bytes plus 4 bytes per extra word, derived from the operand
  // distance between vaddr0 and srsrc.
10057 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
10058 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
10059 }
10060
10061 switch (Opc) {
10062 case TargetOpcode::BUNDLE:
10063 return getInstBundleSize(MI);
10064 case TargetOpcode::INLINEASM:
10065 case TargetOpcode::INLINEASM_BR: {
  // Inline asm size is derived from the asm string held in operand 0.
10066 const MachineFunction *MF = MI.getMF();
10067 const char *AsmStr = MI.getOperand(0).getSymbolName();
10068 return getInlineAsmLength(AsmStr, MF->getTarget().getMCAsmInfo(), &ST);
10069 }
10070 default:
  // Meta instructions are reported as 0 bytes.
10071 if (MI.isMetaInstruction())
10072 return 0;
10073
10074 // If D16 Pseudo inst, get correct MC code size
10075 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
10076 if (D16Info) {
10077 // Assume d16_lo/hi inst are always in same size
10078 unsigned LoInstOpcode = D16Info->LoOp;
10079 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
10080 DescSize = Desc.getSize();
10081 }
10082
10083 // If FMA Pseudo inst, get correct MC code size
10084 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
10085 // All potential lowerings are the same size; arbitrarily pick one.
10086 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
10087 DescSize = Desc.getSize();
10088 }
10089
10090 return DescSize;
10091 }
10092}
10093
  // Branches on subtargets reporting hasOffset3fBug() are excluded from size
  // verification; every other instruction must match its size exactly.
10096 if (MI.isBranch() && ST.hasOffset3fBug())
10097 return InstSizeVerifyMode::NoVerify;
10098 return InstSizeVerifyMode::ExactSize;
10099}
10100
  // Non-FLAT instructions never qualify.
10102 if (!isFLAT(MI))
10103 return false;
10104
  // A FLAT instruction without memory operands is answered with true.
10105 if (MI.memoperands_empty())
10106 return true;
10107
  // NOTE(review): the condition guarding the return inside this loop is not
  // visible here - presumably an address-space check on MMO; confirm.
10108 for (const MachineMemOperand *MMO : MI.memoperands()) {
10110 return true;
10111 }
10112 return false;
10113}
10114
  // Static table pairing each AMDGPU target index with its textual name;
  // returned to the caller as an ArrayRef over the table.
10117 static const std::pair<int, const char *> TargetIndices[] = {
10118 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
10119 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
10120 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
10121 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
10122 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
10123 return ArrayRef(TargetIndices);
10124}
10125
10126/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
10127/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
10130 const ScheduleDAG *DAG) const {
  // Returns a newly allocated GCNHazardRecognizer built from the DAG's
  // MachineFunction.
10131 return new GCNHazardRecognizer(DAG->MF);
10132}
10133
10134/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
10135/// pass.
10138 MachineLoopInfo *MLI) const {
  // Returns a newly allocated GCNHazardRecognizer built from MF and MLI.
10139 return new GCNHazardRecognizer(MF, MLI);
10140}
10141
10142// Called during:
10143// - pre-RA scheduling and post-RA scheduling
10146 const ScheduleDAGMI *DAG) const {
10147 // Borrowed from Arm Target
10148 // We would like to restrict this hazard recognizer to only
10149 // post-RA scheduling; we can tell that we're post-RA because we don't
10150 // track VRegLiveness.
10151 if (!DAG->hasVRegLiveness())
10152 return new GCNHazardRecognizer(DAG->MF);
  // NOTE(review): the return for the pre-RA case is not visible here -
  // presumably a null or default recognizer; confirm.
10154}
10155
10156std::pair<unsigned, unsigned>
  // Splits TF into the bits selected by MO_MASK (first) and the remaining
  // bits (second).
10158 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
10159}
10160
  // Static table pairing each machine-operand target flag with its textual
  // name; returned to the caller as an ArrayRef over the table.
10163 static const std::pair<unsigned, const char *> TargetFlags[] = {
10164 {MO_GOTPCREL, "amdgpu-gotprel"},
10165 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
10166 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
10167 {MO_GOTPCREL64, "amdgpu-gotprel64"},
10168 {MO_REL32_LO, "amdgpu-rel32-lo"},
10169 {MO_REL32_HI, "amdgpu-rel32-hi"},
10170 {MO_REL64, "amdgpu-rel64"},
10171 {MO_ABS32_LO, "amdgpu-abs32-lo"},
10172 {MO_ABS32_HI, "amdgpu-abs32-hi"},
10173 {MO_ABS64, "amdgpu-abs64"},
10174 };
10175
10176 return ArrayRef(TargetFlags);
10177}
10178
  // Static table pairing each target-specific MachineMemOperand flag with its
  // textual name; returned to the caller as an ArrayRef over the table.
10181 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
10182 {
10183 {MONoClobber, "amdgpu-noclobber"},
10184 {MOLastUse, "amdgpu-last-use"},
10185 {MOCooperative, "amdgpu-cooperative"},
10186 {MOThreadPrivate, "amdgpu-thread-private"},
10187 };
10188
10189 return ArrayRef(TargetFlags);
10190}
10191
10193 const MachineFunction &MF) const {
  // SrcReg must be virtual. Registers flagged WWM_REG are copied with
  // WWM_COPY; all others use a plain COPY.
  // NOTE(review): the declaration of MFI is not visible here.
10195 assert(SrcReg.isVirtual());
10196 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
10197 return AMDGPU::WWM_COPY;
10198
10199 return AMDGPU::COPY;
10200}
10201
10203 uint32_t Opcode = MI.getOpcode();
10204 // Check if it is SGPR spill or wwm-register spill Opcode.
10205 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
10206 return true;
10207
10208 const MachineFunction *MF = MI.getMF();
10209 const MachineRegisterInfo &MRI = MF->getRegInfo();
  // NOTE(review): the declaration of MFI (used at the end) is not visible
  // here.
10211
10212 // See if this is Liverange split instruction inserted for SGPR or
10213 // wwm-register. The implicit def inserted for wwm-registers should also be
10214 // included as they can appear at the bb begin.
10215 bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
10216 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
10217 return false;
10218
  // For an SGPR-class destination only live-range-split instructions qualify
  // (an IMPLICIT_DEF of an SGPR does not); any other destination qualifies
  // when it is a WWM register.
10219 Register Reg = MI.getOperand(0).getReg();
10220 if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
10221 return IsLRSplitInst;
10222
10223 return MFI->isWWMReg(Reg);
10224}
10225
10227 Register Reg) const {
10228 // We need to handle instructions which may be inserted during register
10229 // allocation to handle the prolog. The initial prolog instruction may have
10230 // been separated from the start of the block by spills and copies inserted
10231 // needed by the prolog. However, the insertions for scalar registers can
10232 // always be placed at the BB top as they are independent of the exec mask
10233 // value.
  // Stays true when Reg is null; otherwise true only if Reg's class is not an
  // SGPR class.
10234 bool IsNullOrVectorRegister = true;
10235 if (Reg) {
10236 const MachineFunction *MF = MI.getMF();
10237 const MachineRegisterInfo &MRI = MF->getRegInfo();
10238 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
10239 }
10240
  // MI counts as prologue if canAddToBBProlog accepts it, or if it is a
  // non-terminator, non-COPY instruction that modifies EXEC.
10241 return IsNullOrVectorRegister &&
10242 (canAddToBBProlog(MI) ||
10243 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
10244 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
10245}
10246
10250 const DebugLoc &DL,
10251 Register DestReg) const {
  // Subtargets with no-carry add instructions get a plain V_ADD_U32_e64.
10252 if (ST.hasAddNoCarryInsts())
10253 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
10254
  // Otherwise use V_ADD_CO_U32_e64 with a dead carry-out in a new virtual
  // register of the bool class, hinted towards VCC.
10255 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10256 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
10257 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
10258
10259 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10260 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10261}
10262
10265 const DebugLoc &DL,
10266 Register DestReg,
10267 RegScavenger &RS) const {
  // Subtargets with no-carry add instructions get a plain V_ADD_U32_e32.
10268 if (ST.hasAddNoCarryInsts())
10269 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
10270
10271 // If available, prefer to use vcc.
  // Otherwise scavenge a register of the bool class backwards from I, without
  // allowing a spill.
10272 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
10273 ? Register(RI.getVCC())
10274 : RS.scavengeRegisterBackwards(
10275 *RI.getBoolRC(), I, /* RestoreAfter */ false,
10276 0, /* AllowSpill */ false);
10277
10278 // TODO: Users need to deal with this.
  // No carry register could be found: return an empty builder.
10279 if (!UnusedCarry.isValid())
10280 return MachineInstrBuilder();
10281
10282 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10283 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10284}
10285
10286bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10287 switch (Opcode) {
10288 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10289 case AMDGPU::SI_KILL_I1_TERMINATOR:
10290 return true;
10291 default:
10292 return false;
10293 }
10294}
10295
  // Maps each SI_KILL_*_PSEUDO opcode to the descriptor of the matching
  // SI_KILL_*_TERMINATOR; any other opcode is a programming error.
10297 switch (Opcode) {
10298 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10299 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10300 case AMDGPU::SI_KILL_I1_PSEUDO:
10301 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
10302 default:
10303 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10304 }
10305}
10306
10307bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10308 return Imm <= getMaxMUBUFImmOffset(ST);
10309}
10310
10312 // GFX12 field is non-negative 24-bit signed byte offset.
  // The result is the all-ones value of OffsetBits bits: 23 bits on GFX12 and
  // newer, 12 bits on older generations.
10313 const unsigned OffsetBits =
10314 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10315 return (1 << OffsetBits) - 1;
10316}
10317
  // Only wave32 subtargets need any rewriting.
10319 if (!ST.isWave32())
10320 return;
10321
  // Inline asm is left untouched.
10322 if (MI.isInlineAsm())
10323 return;
10324
10325 if (MI.getNumOperands() < MI.getNumExplicitOperands())
10326 return;
10327
  // Rewrite every implicit VCC register operand to VCC_LO.
10328 for (auto &Op : MI.implicit_operands()) {
10329 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10330 Op.setReg(AMDGPU::VCC_LO);
10331 }
10332}
10333
// NOTE(review): the signature line of this definition is not part of this
// listing. Returns true when MI is an SMRD instruction that has an sbase
// operand whose declared register class has SGPR_128 as a sub-class (or is
// SGPR_128 itself).
10335 if (!isSMRD(MI))
10336 return false;
10337
10338 // Check that it is using a buffer resource.
10339 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
10340 if (Idx == -1) // e.g. s_memtime
10341 return false;
10342
// The class is taken from the instruction description, not from the register
// currently assigned to the operand.
10343 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
10344 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
10345}
10346
10347// Given Imm, split it into the values to put into the SOffset and ImmOffset
10348// fields in an MUBUF instruction. Return false if it is not possible (due to a
10349// hardware bug needing a workaround).
10350//
10351// The required alignment ensures that individual address components remain
10352// aligned if they are aligned to begin with. It also ensures that additional
10353// offsets within the given alignment can be added to the resulting ImmOffset.
10355 uint32_t &ImmOffset, Align Alignment) const {
// NOTE(review): the first line of this signature is not part of this listing.
10356 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
// Largest immediate offset that is still a multiple of Alignment.
10357 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
// Part of the offset that has to go into SOffset; 0 means Imm fits as is.
10358 uint32_t Overflow = 0;
10359
10360 if (Imm > MaxImm) {
10361 if (Imm <= MaxImm + 64) {
10362 // Use an SOffset inline constant for 4..64
10363 Overflow = Imm - MaxImm;
10364 Imm = MaxImm;
10365 } else {
10366 // Try to keep the same value in SOffset for adjacent loads, so that
10367 // the corresponding register contents can be re-used.
10368 //
10369 // Load values with all low-bits (except for alignment bits) set into
10370 // SOffset, so that a larger range of values can be covered using
10371 // s_movk_i32.
10372 //
10373 // Atomic operations fail to work correctly when individual address
10374 // components are unaligned, even if their sum is aligned.
// Alignment is added before the split and subtracted from the high part
// afterwards, so Low + (High - Alignment) still equals the original Imm.
10375 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10376 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10377 Imm = Low;
10378 Overflow = High - Alignment.value();
10379 }
10380 }
10381
10382 if (Overflow > 0) {
10383 // There is a hardware bug in SI and CI which prevents address clamping in
10384 // MUBUF instructions from working correctly with SOffsets. The immediate
10385 // offset is unaffected.
10386 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10387 return false;
10388
10389 // It is not possible to set immediate in SOffset field on some targets.
10390 if (ST.hasRestrictedSOffset())
10391 return false;
10392 }
10393
// The output references are written only on success.
10394 ImmOffset = Imm;
10395 SOffset = Overflow;
10396 return true;
10397}
10398
10399// Depending on the used address space and instructions, some immediate offsets
10400// are allowed and some are not.
10401// Pre-GFX12, flat instruction offsets can only be non-negative, global and
10402// scratch instruction offsets can also be negative. On GFX12, offsets can be
10403// negative for all variants.
10404//
10405// There are several bugs related to these offsets:
10406// On gfx10.1, flat instructions that go into the global address space cannot
10407// use an offset.
10408//
10409// For scratch instructions, the address can be either an SGPR or a VGPR.
10410// The following offsets can be used, depending on the architecture (x means
10411// cannot be used):
10412// +----------------------------+------+------+
10413// | Address-Mode | SGPR | VGPR |
10414// +----------------------------+------+------+
10415// | gfx9 | | |
10416// | negative, 4-aligned offset | x | ok |
10417// | negative, unaligned offset | x | ok |
10418// +----------------------------+------+------+
10419// | gfx10 | | |
10420// | negative, 4-aligned offset | ok | ok |
10421// | negative, unaligned offset | ok | x |
10422// +----------------------------+------+------+
10423// | gfx10.3 | | |
10424// | negative, 4-aligned offset | ok | ok |
10425// | negative, unaligned offset | ok | ok |
10426// +----------------------------+------+------+
10427//
10428// This function ignores the addressing mode, so if an offset cannot be used in
10429// one addressing mode, it is considered illegal.
10430bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10431 AMDGPU::FlatAddrSpace FlatVariant) const {
10432 // TODO: Should 0 be special cased?
// Without flat instruction offsets no offset is legal (this includes 0).
10433 if (!ST.hasFlatInstOffsets())
10434 return false;
10435
// On subtargets with the flat segment offset bug, FLAT-variant instructions
// cannot use an offset when the access is to the flat or global address space.
10437 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == FlatAddrSpace::FLAT &&
10438 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10439 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10440 return false;
10441
// On subtargets with the negative-unaligned scratch offset bug, a negative
// scratch offset must be a multiple of 4.
10442 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10443 FlatVariant == FlatAddrSpace::FlatScratch && Offset < 0 &&
10444 (Offset % 4) != 0) {
10445 return false;
10446 }
10447
// The offset must fit in a signed field of N bits, and must be non-negative
// unless this variant permits negative offsets.
10448 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10449 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10450 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10451}
10452
10453// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
10454std::pair<int64_t, int64_t>
10455SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10456 AMDGPU::FlatAddrSpace FlatVariant) const {
10457 int64_t RemainderOffset = COffsetVal;
10458 int64_t ImmField = 0;
10459
10460 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10461 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10462
10463 if (AllowNegative) {
10464 // Use signed division by a power of two to truncate towards 0.
10465 int64_t D = 1LL << NumBits;
10466 RemainderOffset = (COffsetVal / D) * D;
10467 ImmField = COffsetVal - RemainderOffset;
10468
10469 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10470 FlatVariant == AMDGPU::FlatAddrSpace::FlatScratch && ImmField < 0 &&
10471 (ImmField % 4) != 0) {
10472 // Make ImmField a multiple of 4
10473 RemainderOffset += ImmField % 4;
10474 ImmField -= ImmField % 4;
10475 }
10476 } else if (COffsetVal >= 0) {
10477 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10478 RemainderOffset = COffsetVal - ImmField;
10479 }
10480
10481 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10482 assert(RemainderOffset + ImmField == COffsetVal);
10483 return {ImmField, RemainderOffset};
10484}
10485
10487 AMDGPU::FlatAddrSpace FlatVariant) const {
// NOTE(review): the first line of this signature and the second half of the
// condition below are not part of this listing.
10488 if (ST.hasNegativeScratchOffsetBug() &&
10490 return false;
10491
// Otherwise negative offsets are permitted for every variant except FLAT,
// where they are permitted only on GFX12 and newer.
10492 return FlatVariant != AMDGPU::FlatAddrSpace::FLAT || AMDGPU::isGFX12Plus(ST);
10493}
10494
// Maps the subtarget's generation to a SIEncodingFamily value; reaching the
// end of the switch without returning is treated as unreachable.
// NOTE(review): the case labels and some continuation lines of the switch are
// not part of this listing.
10495static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10496 switch (ST.getGeneration()) {
10497 default:
10498 break;
10501 return SIEncodingFamily::SI;
10504 return SIEncodingFamily::VI;
10508 return ST.hasGFX11_7Insts() ? SIEncodingFamily::GFX1170
10511 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10515 }
10516 llvm_unreachable("Unknown subtarget generation!");
10517}
10518
10519bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10520 switch(MCOp) {
10521 // These opcodes use indirect register addressing so
10522 // they need special handling by codegen (currently missing).
10523 // Therefore it is too risky to allow these opcodes
10524 // to be selected by dpp combiner or sdwa peepholer.
10525 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10526 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10527 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10528 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10529 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10530 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10531 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10532 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10533 return true;
10534 default:
10535 return false;
10536 }
10537}
10538
// Expands to the case labels for the _e32, _e64, _e64_dpp, _dpp and _sdwa
// variants of opcode OP. Intended for use inside a switch statement.
#define GENERATE_RENAMED_GFX9_CASES(OP)                                        \
  case OP##_e32:                                                               \
  case OP##_e64:                                                               \
  case OP##_e64_dpp:                                                           \
  case OP##_dpp:                                                               \
  case OP##_sdwa:
10545
10546static bool isRenamedInGFX9(int Opcode) {
10547 switch (Opcode) {
10548 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10549 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10550 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10551 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10552 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10553 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10554 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10555 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10556 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10557 //
10558 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10559 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10560 case AMDGPU::V_FMA_F16_gfx9_e64:
10561 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10562 case AMDGPU::V_INTERP_P2_F16:
10563 case AMDGPU::V_MAD_F16_e64:
10564 case AMDGPU::V_MAD_U16_e64:
10565 case AMDGPU::V_MAD_I16_e64:
10566 return true;
10567 default:
10568 return false;
10569 }
10570}
10571
// Translates a pseudo opcode into the MC opcode used for this subtarget.
// Returns Opcode unchanged when it is already a native instruction, and -1
// when there is no encoding for this subtarget or when the result is rejected
// by isAsmOnlyOpcode().
// NOTE(review): several statements of this body (the bodies of a number of
// `if`s and the case labels of the SDWA switch) are not part of this listing.
10572int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10573 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10574 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10575
// Start from the encoding family implied by the subtarget generation; the
// checks below may override it.
10576 unsigned Gen = subtargetEncodingFamily(ST);
10577
10578 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10580
10581 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10582 // subtarget has UnpackedD16VMem feature.
10583 // TODO: remove this when we discard GFX80 encoding.
10584 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10586
10587 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10588 switch (ST.getGeneration()) {
10589 default:
10591 break;
10594 break;
10597 break;
10598 }
10599 }
10600
// MAI opcodes are replaced by their early-clobber form when one exists.
10601 if (isMAI(Opcode)) {
10602 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10603 if (MFMAOp != -1)
10604 Opcode = MFMAOp;
10605 }
10606
10607 int32_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10608
10609 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX11_7Insts())
10611
10612 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX1250Insts())
10614
10615 // -1 means that Opcode is already a native instruction.
10616 if (MCOp == -1)
10617 return Opcode;
10618
// On GFX90A-class subtargets an alternative opcode (NMCOp) replaces MCOp
// whenever one of the lookups yields something other than
// INSTRUCTION_LIST_END.
10619 if (ST.hasGFX90AInsts()) {
10620 uint32_t NMCOp = AMDGPU::INSTRUCTION_LIST_END;
10621 if (ST.hasGFX940Insts())
10623 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10625 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10627 if (NMCOp != AMDGPU::INSTRUCTION_LIST_END)
10628 MCOp = NMCOp;
10629 }
10630
10631 // INSTRUCTION_LIST_END means that Opcode is a pseudo instruction that has no
10632 // encoding in the given subtarget generation.
10633 if (MCOp == AMDGPU::INSTRUCTION_LIST_END)
10634 return -1;
10635
10636 if (isAsmOnlyOpcode(MCOp))
10637 return -1;
10638
10639 return MCOp;
10640}
10641
// Returns the (register, sub-register) pair of a register operand, or a
// default-constructed pair when the operand is marked undef.
// NOTE(review): the line carrying the function name and parameter list is not
// part of this listing.
10642static
10644 assert(RegOpnd.isReg());
10645 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10646 getRegSubRegPair(RegOpnd);
10647}
10648
// NOTE(review): the signature lines and the final statement of this definition
// are not part of this listing. Looks for the REG_SEQUENCE input that is
// inserted at sub-register index SubReg and returns it via getRegOrUndef().
10651 assert(MI.isRegSequence());
// Operand 0 is skipped; the remaining operands are visited in pairs, the
// first of each pair being the register and the second the immediate
// sub-register index it is inserted at.
10652 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10653 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10654 auto &RegOp = MI.getOperand(1 + 2 * I);
10655 return getRegOrUndef(RegOp);
10656 }
10658}
10659
10660// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10661// Following a subreg of reg:subreg isn't supported
// NOTE(review): the signature line of this definition is not part of this
// listing. On success RSR is updated in place and true is returned.
// Only reg:subreg pairs are followed; a pair without a sub-register is
// rejected.
10664 if (!RSR.SubReg)
10665 return false;
10666 switch (MI.getOpcode()) {
10667 default: break;
10668 case AMDGPU::REG_SEQUENCE:
10669 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10670 return true;
10671 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
10672 case AMDGPU::INSERT_SUBREG:
// Operand 3 is the sub-register index being written, operand 2 the value
// inserted there, and operand 1 the register supplying everything else.
10673 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10674 // inserted the subreg we're looking for
10675 RSR = getRegOrUndef(MI.getOperand(2));
10676 else { // the subreg in the rest of the reg
10677 auto R1 = getRegOrUndef(MI.getOperand(1));
10678 if (R1.SubReg) // subreg of subreg isn't supported
10679 return false;
// Keep RSR.SubReg and continue looking in the base register.
10680 RSR.Reg = R1.Reg;
10681 }
10682 return true;
10683 }
10684 return false;
10685}
10686
10688 const MachineRegisterInfo &MRI) {
// NOTE(review): the first line of this signature is not part of this listing.
// Walks the SSA definition chain of P through COPY / V_MOV_B32_e32 with a
// virtual source and through the sub-register pseudos handled by
// followSubRegDef(). Returns the last defining instruction reached, or
// nullptr when P is not a virtual register or an undef source is met.
10689 assert(MRI.isSSA());
10690 if (!P.Reg.isVirtual())
10691 return nullptr;
10692
10693 auto RSR = P;
10694 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10695 while (auto *MI = DefInst) {
// DefInst is only set again if the chain can be followed past MI.
10696 DefInst = nullptr;
10697 switch (MI->getOpcode()) {
10698 case AMDGPU::COPY:
10699 case AMDGPU::V_MOV_B32_e32: {
10700 auto &Op1 = MI->getOperand(1);
10701 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10702 if (Op1.isUndef())
10703 return nullptr;
10704 RSR = getRegSubRegPair(Op1);
10705 DefInst = MRI.getVRegDef(RSR.Reg);
10706 }
10707 break;
10708 }
10709 default:
10710 if (followSubRegDef(*MI, RSR)) {
// An empty register here means the followed operand was undef.
10711 if (!RSR.Reg)
10712 return nullptr;
10713 DefInst = MRI.getVRegDef(RSR.Reg);
10714 }
10715 }
// Nothing further to follow: MI is the answer.
10716 if (!DefInst)
10717 return MI;
10718 }
10719 return nullptr;
10720}
10721
10723 Register VReg,
10724 const MachineInstr &DefMI,
10725 const MachineInstr &UseMI) {
// NOTE(review): the first line of this signature is not part of this listing.
// Conservative query: returns false only when DefMI and UseMI are in the same
// block and none of the (at most MaxInstScan non-debug) instructions strictly
// between them modifies EXEC.
10726 assert(MRI.isSSA() && "Must be run on SSA");
10727
10728 auto *TRI = MRI.getTargetRegisterInfo();
10729 auto *DefBB = DefMI.getParent();
10730
10731 // Don't bother searching between blocks, although it is possible this block
10732 // doesn't modify exec.
10733 if (UseMI.getParent() != DefBB)
10734 return true;
10735
10736 const int MaxInstScan = 20;
10737 int NumInst = 0;
10738
10739 // Stop scan at the use.
10740 auto E = UseMI.getIterator();
10741 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
// Debug instructions do not count towards the scan limit.
10742 if (I->isDebugInstr())
10743 continue;
10744
// Give up (answer "may be modified") once the scan budget is exhausted.
10745 if (++NumInst > MaxInstScan)
10746 return true;
10747
10748 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10749 return true;
10750 }
10751
10752 return false;
10753}
10754
10756 Register VReg,
10757 const MachineInstr &DefMI) {
// NOTE(review): the first line of this signature is not part of this listing.
// Conservative query: returns false only when every non-debug use of VReg is
// a non-PHI instruction in DefMI's block and all of them are reached, within
// the scan limits, before any instruction defines a register overlapping
// EXEC.
10758 assert(MRI.isSSA() && "Must be run on SSA");
10759
10760 auto *TRI = MRI.getTargetRegisterInfo();
10761 auto *DefBB = DefMI.getParent();
10762
10763 const int MaxUseScan = 10;
10764 int NumUse = 0;
10765
// First pass: count the uses and reject anything outside this block.
10766 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10767 auto &UseInst = *Use.getParent();
10768 // Don't bother searching between blocks, although it is possible this block
10769 // doesn't modify exec.
10770 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10771 return true;
10772
10773 if (++NumUse > MaxUseScan)
10774 return true;
10775 }
10776
// No uses at all: nothing can observe a modified EXEC.
10777 if (NumUse == 0)
10778 return false;
10779
10780 const int MaxInstScan = 20;
10781 int NumInst = 0;
10782
10783 // Stop scan when we have seen all the uses.
10784 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
// All counted uses are in this block, so the scan is expected to finish
// before running off the end.
10785 assert(I != DefBB->end());
10786
10787 if (I->isDebugInstr())
10788 continue;
10789
10790 if (++NumInst > MaxInstScan)
10791 return true;
10792
10793 for (const MachineOperand &Op : I->operands()) {
10794 // We don't check reg masks here as they're used only on calls:
10795 // 1. EXEC is only considered const within one BB
10796 // 2. Call should be a terminator instruction if present in a BB
10797
10798 if (!Op.isReg())
10799 continue;
10800
10801 Register Reg = Op.getReg();
// Use operands tick off the remaining uses of VReg; any other register
// operand (a def) is checked for overlap with EXEC.
10802 if (Op.isUse()) {
10803 if (Reg == VReg && --NumUse == 0)
10804 return false;
10805 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10806 return true;
10807 }
10808 }
10809}
10810
10813 const DebugLoc &DL, Register Src, Register Dst) const {
// NOTE(review): the first lines of this signature are not part of this
// listing. Scans from the start of MBB up to (not including) LastPHIIt; if a
// non-PHI instruction in that range reads Dst, the COPY from Src to Dst is
// inserted right before it. Otherwise the generic TargetInstrInfo
// implementation decides the placement.
10814 auto Cur = MBB.begin();
10815 if (Cur != MBB.end())
10816 do {
10817 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10818 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10819 ++Cur;
10820 } while (Cur != MBB.end() && Cur != LastPHIIt);
10821
10822 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10823 Dst);
10824}
10825
10828 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
// NOTE(review): the first lines of this signature are not part of this
// listing. When the insertion point is an SI_IF / SI_ELSE / SI_IF_BREAK that
// itself defines Src, the copy is emitted after that instruction, using the
// lane-mask MovTermOpc opcode with an implicit EXEC operand. Every other case
// is handled by the generic TargetInstrInfo implementation.
10829 if (InsPt != MBB.end() &&
10830 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10831 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10832 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10833 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
// Step past the defining instruction so that the copy reads the new value.
10834 InsPt++;
10835 return BuildMI(MBB, InsPt, DL,
10836 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10837 .addReg(Src, {}, SrcSubReg)
10838 .addReg(AMDGPU::EXEC, RegState::Implicit);
10839 }
10840 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10841 Dst);
10842}
10843
10844bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10845
10847 const MachineInstr &SecondMI) const {
// NOTE(review): the first line of this signature (including the function
// name) is not part of this listing. Returns true if FirstMI modifies any
// register that SecondMI uses.
10848 for (const auto &Use : SecondMI.all_uses()) {
10849 if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg(), &RI))
10850 return true;
10851 }
10852 return false;
10853}
10854
10855/// If OpX is multicycle, anti-dependencies are not allowed.
10856/// isDPMACCInstruction was not designed for VOPD, but it is fit for the
10857/// purpose.
10859 const MachineInstr &OpX) const {
10861}
10862
10865 ArrayRef<unsigned> Ops, int FrameIndex,
10866 MachineInstr *&CopyMI, LiveIntervals *LIS,
10867 VirtRegMap *VRM) const {
// NOTE(review): the first lines of this signature are not part of this
// listing. Every path through this body returns nullptr; its only effect is
// to constrain the register class of the virtual register in a full copy
// between a virtual and a physical register.
10868 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10869 //
10870 // %0:sreg_32 = COPY $m0
10871 //
10872 // We explicitly chose SReg_32 for the virtual register so such a copy might
10873 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10874 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10875 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10876 // TargetInstrInfo::foldMemoryOperand() is going to try.
10877 // A similar issue also exists with spilling and reloading $exec registers.
10878 //
10879 // To prevent that, constrain the %0 register class here.
10880 if (isFullCopyInstr(MI)) {
10881 Register DstReg = MI.getOperand(0).getReg();
10882 Register SrcReg = MI.getOperand(1).getReg();
// Exactly one of the two registers must be virtual.
10883 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10884 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10885 MachineRegisterInfo &MRI = MF.getRegInfo();
10886 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10887 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
// 32-bit case: constrain to the SReg_32_XM0_XEXEC class.
10888 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10889 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10890 return nullptr;
10891 }
// 64-bit case: constrain to the SReg_64_XEXEC class.
10892 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10893 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10894 return nullptr;
10895 }
10896 }
10897 }
10898
10899 return nullptr;
10900}
10901
10903 const MachineInstr &MI,
10904 unsigned *PredCost) const {
// NOTE(review): the first line of this signature and the declaration of the
// iterator I are not part of this listing. PredCost is not used in the
// visible body.
10905 if (MI.isBundle()) {
10907 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
// For a bundle: the maximum latency over the bundled instructions plus one
// for each bundled instruction after the first.
10908 unsigned Lat = 0, Count = 0;
10909 for (++I; I != E && I->isBundledWithPred(); ++I) {
10910 ++Count;
10911 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10912 }
10913 return Lat + Count - 1;
10914 }
10915
// Not a bundle: defer to the scheduling model.
10916 return SchedModel.computeInstrLatency(&MI);
10917}
10918
// NOTE(review): the line with the function name and parameters, and the
// fallback statement at the end of the body, are not part of this listing.
10919const MachineOperand &
// Prefer the operand named src0 when the instruction has one.
10921 if (const MachineOperand *CallAddrOp =
10922 getNamedOperand(MI, AMDGPU::OpName::src0))
10923 return *CallAddrOp;
10925}
10926
// NOTE(review): the signature of this definition and most of its return
// statements are not part of this listing; the comments below describe only
// the conditions that are visible. The function classifies generic (G_*)
// machine instructions for uniformity analysis.
10929 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10930 unsigned Opcode = MI.getOpcode();
10931
// Helper for address space casts. For a GIntrinsic the source pointer is
// operand 2, otherwise operand 1. The visible part of the condition tests for
// a private -> flat cast on a subtarget with globally addressable scratch.
10932 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10933 Register Dst = MI.getOperand(0).getReg();
10934 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10935 : MI.getOperand(1).getReg();
10936 LLT DstTy = MRI.getType(Dst);
10937 LLT SrcTy = MRI.getType(Src);
10938 unsigned DstAS = DstTy.getAddressSpace();
10939 unsigned SrcAS = SrcTy.getAddressSpace();
10940 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10941 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10942 ST.hasGloballyAddressableScratch()
10945 };
10946
10947 // If the target supports globally addressable scratch, the mapping from
10948 // scratch memory to the flat aperture changes therefore an address space cast
10949 // is no longer uniform.
10950 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10951 return HandleAddrSpaceCast(MI);
10952
10953 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10954 auto IID = GI->getIntrinsicID();
10959
10960 switch (IID) {
10961 case Intrinsic::amdgcn_addrspacecast_nonnull:
10962 return HandleAddrSpaceCast(MI);
10963 case Intrinsic::amdgcn_if:
10964 case Intrinsic::amdgcn_else:
10965 // FIXME: Uniform if second result
10966 break;
10967 }
10968
10970 }
10971
10972 // Loads from the private and flat address spaces are divergent, because
10973 // threads can execute the load instruction with the same inputs and get
10974 // different results.
10975 //
10976 // All other loads are not divergent, because if threads issue loads with the
10977 // same arguments, they will always get the same result.
10978 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10979 Opcode == AMDGPU::G_SEXTLOAD) {
// Without memory operands the address space is unknown.
10980 if (MI.memoperands_empty())
10981 return ValueUniformity::NeverUniform; // conservative assumption
10982
10983 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10984 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10985 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10986 })) {
10987 // At least one MMO in a non-global address space.
10989 }
10991 }
10992
// Generic atomic read-modify-write, compare-exchange and target-specific
// generic atomics share one classification (its return line is not part of
// this listing).
10993 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10994 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10995 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10996 AMDGPU::isGenericAtomic(Opcode)) {
10998 }
10999
11000 // Result is computed from uniform SP and uniform wave-wide max size.
11001 if (Opcode == TargetOpcode::G_DYN_STACKALLOC)
11003
11004 if (Opcode == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
11006
11008}
11009
// NOTE(review): the signature line of this definition is not part of this
// listing. Creates the AMDGPUMIRFormatter on first use and returns a pointer
// to the instance held in Formatter on every call.
11011 if (!Formatter)
11012 Formatter = std::make_unique<AMDGPUMIRFormatter>(ST);
11013 return Formatter.get();
11014}
11015
11017
// NOTE(review): the signature of this definition and most of its return
// statements are not part of this listing; the comments below describe only
// the conditions that are visible.
11018 if (isNeverUniform(MI))
11020
// Lane-read style opcodes are tested by opcode.
11021 unsigned opcode = MI.getOpcode();
11022 if (opcode == AMDGPU::V_READLANE_B32 ||
11023 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
11024 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
11026
11027 // If any of defs is divergent, report as NeverUniform. isUniformReg will
11028 // calculate in more detail for each def from its reg class, if available.
11029 if (MI.isInlineAsm()) {
11030 for (const MachineOperand &MO : MI.operands()) {
11031 if (!MO.isReg() || !MO.isDef())
11032 continue;
// A def with no register class constraint, or with a non-SGPR class, is the
// case the comment above calls divergent.
11033 const TargetRegisterClass *RC =
11034 MI.getRegClassConstraint(MO.getOperandNo(), this, &RI);
11035 if (!RC || !RI.isSGPRClass(RC))
11037 }
11038 }
11039
// A copy from a physical register is classified by the base register class
// of its source.
11040 if (isCopyInstr(MI)) {
11041 const MachineOperand &srcOp = MI.getOperand(1);
11042 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
11043 const TargetRegisterClass *regClass =
11044 RI.getPhysRegBaseClass(srcOp.getReg());
11045 return RI.isSGPRClass(regClass) ? ValueUniformity::AlwaysUniform
11047 }
11049 }
11050
11051 // GMIR handling
11052 if (MI.isPreISelOpcode())
11054
11055 // Atomics are divergent because they are executed sequentially: when an
11056 // atomic operation refers to the same address in each thread, then each
11057 // thread after the first sees the value written by the previous thread as
11058 // original value.
11059
11060 if (isAtomic(MI))
11062
11063 // Loads from the private and flat address spaces are divergent, because
11064 // threads can execute the load instruction with the same inputs and get
11065 // different results.
11066 if (isFLAT(MI) && MI.mayLoad()) {
11067 if (MI.memoperands_empty())
11068 return ValueUniformity::NeverUniform; // conservative assumption
11069
11070 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
11071 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
11072 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
11073 })) {
11074 // At least one MMO in a non-global address space.
11076 }
11077
11079 }
11080
11081 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
11082 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
11083
11084 // FIXME: It's conceptually broken to report this for an instruction, and not
11085 // a specific def operand. For inline asm in particular, there could be mixed
11086 // uniform and divergent results.
// Inspect every register operand that is read by MI.
11087 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
11088 const MachineOperand &SrcOp = MI.getOperand(I);
11089 if (!SrcOp.isReg())
11090 continue;
11091
11092 Register Reg = SrcOp.getReg();
11093 if (!Reg || !SrcOp.readsReg())
11094 continue;
11095
11096 // If RegBank is null, this is unassigned or an unallocatable special
11097 // register, which are all scalars.
11098 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
11099 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
11101 }
11102
11103 // TODO: Uniformity check conditions above can be rearranged for more
11104 // readability
11105
11106 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
11107 // currently turned into no-op COPYs by SelectionDAG ISel and are
11108 // therefore no longer recognizable.
11109
11111}
11112
// NOTE(review): the signature line and most case labels of this definition
// are not part of this listing. Maps the calling convention of MF's function
// to a small integer: 1, 2 or 3 for three groups of conventions, 0 otherwise.
// One group first emits an "unsupported" diagnostic and then falls through to
// the 0 result.
11114 switch (MF.getFunction().getCallingConv()) {
11116 return 1;
11118 return 2;
11120 return 3;
11124 const Function &F = MF.getFunction();
11125 F.getContext().diagnose(DiagnosticInfoUnsupported(
11126 F, "ds_ordered_count unsupported for this calling conv"));
11127 [[fallthrough]];
11128 }
11131 case CallingConv::C:
11132 case CallingConv::Fast:
11133 default:
11134 // Assume other calling conventions are various compute callable functions
11135 return 0;
11136 }
11137}
11138
11140 Register &SrcReg2, int64_t &CmpMask,
11141 int64_t &CmpValue) const {
// NOTE(review): the first line of this signature is not part of this listing.
// Recognizes scalar compares and reports their operands through the output
// references. Returns false for anything else, and for compares whose
// register operands carry a sub-register index.
11142 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
11143 return false;
11144
11145 switch (MI.getOpcode()) {
11146 default:
11147 break;
// Two-operand S_CMP_*: the second operand may be a register (reported in
// SrcReg2 with CmpValue 0) or an immediate (SrcReg2 cleared, CmpValue set).
11148 case AMDGPU::S_CMP_EQ_U32:
11149 case AMDGPU::S_CMP_EQ_I32:
11150 case AMDGPU::S_CMP_LG_U32:
11151 case AMDGPU::S_CMP_LG_I32:
11152 case AMDGPU::S_CMP_LT_U32:
11153 case AMDGPU::S_CMP_LT_I32:
11154 case AMDGPU::S_CMP_GT_U32:
11155 case AMDGPU::S_CMP_GT_I32:
11156 case AMDGPU::S_CMP_LE_U32:
11157 case AMDGPU::S_CMP_LE_I32:
11158 case AMDGPU::S_CMP_GE_U32:
11159 case AMDGPU::S_CMP_GE_I32:
11160 case AMDGPU::S_CMP_EQ_U64:
11161 case AMDGPU::S_CMP_LG_U64:
11162 SrcReg = MI.getOperand(0).getReg();
11163 if (MI.getOperand(1).isReg()) {
11164 if (MI.getOperand(1).getSubReg())
11165 return false;
11166 SrcReg2 = MI.getOperand(1).getReg();
11167 CmpValue = 0;
11168 } else if (MI.getOperand(1).isImm()) {
11169 SrcReg2 = Register();
11170 CmpValue = MI.getOperand(1).getImm();
11171 } else {
11172 return false;
11173 }
11174 CmpMask = ~0;
11175 return true;
// S_CMPK_*: the second operand is read as an immediate.
11176 case AMDGPU::S_CMPK_EQ_U32:
11177 case AMDGPU::S_CMPK_EQ_I32:
11178 case AMDGPU::S_CMPK_LG_U32:
11179 case AMDGPU::S_CMPK_LG_I32:
11180 case AMDGPU::S_CMPK_LT_U32:
11181 case AMDGPU::S_CMPK_LT_I32:
11182 case AMDGPU::S_CMPK_GT_U32:
11183 case AMDGPU::S_CMPK_GT_I32:
11184 case AMDGPU::S_CMPK_LE_U32:
11185 case AMDGPU::S_CMPK_LE_I32:
11186 case AMDGPU::S_CMPK_GE_U32:
11187 case AMDGPU::S_CMPK_GE_I32:
11188 SrcReg = MI.getOperand(0).getReg();
11189 SrcReg2 = Register();
11190 CmpValue = MI.getOperand(1).getImm();
11191 CmpMask = ~0;
11192 return true;
11193 }
11194
11195 return false;
11196}
11197
// NOTE(review): the signature line of this definition is not part of this
// listing. Returns true when no successor of MBB lists SCC as a live-in.
11199 for (MachineBasicBlock *S : MBB->successors()) {
11200 if (S->isLiveIn(AMDGPU::SCC))
11201 return false;
11202 }
11203 return true;
11204}
11205
11206// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
11207// (incoming SCC) = !(SCC defined by SCCDef).
11208// Return true if all uses can be re-written, false otherwise.
11209bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
11210 MachineBasicBlock *MBB = SCCDef->getParent();
11211 SmallVector<MachineInstr *> InvertInstr;
11212 bool SCCIsDead = false;
11213
11214 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
11215 constexpr unsigned ScanLimit = 12;
11216 unsigned Count = 0;
11217 for (MachineInstr &MI :
11218 make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
11219 if (++Count > ScanLimit)
11220 return false;
11221 if (MI.readsRegister(AMDGPU::SCC, &RI)) {
11222 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
11223 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
11224 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11225 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
11226 InvertInstr.push_back(&MI);
11227 else
11228 return false;
11229 }
11230 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
11231 SCCIsDead = true;
11232 break;
11233 }
11234 }
11235 if (!SCCIsDead && isSCCDeadOnExit(MBB))
11236 SCCIsDead = true;
11237
11238 // SCC may have more uses. Can't invert all of them.
11239 if (!SCCIsDead)
11240 return false;
11241
11242 // Invert uses
11243 for (MachineInstr *MI : InvertInstr) {
11244 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
11245 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
11246 swapOperands(*MI);
11247 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11248 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
11249 MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
11250 ? AMDGPU::S_CBRANCH_SCC1
11251 : AMDGPU::S_CBRANCH_SCC0));
11252 } else {
11253 llvm_unreachable("SCC used but no inversion handling");
11254 }
11255 }
11256 return true;
11257}
11258
11259// SCC is already valid after SCCValid.
11260// SCCRedefine will redefine SCC to the same value already available after
11261// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
11262// update kill/dead flags if necessary.
11263bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
11264 bool NeedInversion) const {
11265 MachineInstr *KillsSCC = nullptr;
11266 if (SCCValid->getParent() != SCCRedefine->getParent())
11267 return false;
11268 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
11269 SCCRedefine->getIterator())) {
11270 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
11271 return false;
11272 if (MI.killsRegister(AMDGPU::SCC, &RI))
11273 KillsSCC = &MI;
11274 }
11275 if (NeedInversion && !invertSCCUse(SCCRedefine))
11276 return false;
11277 if (MachineOperand *SccDef =
11278 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
11279 SccDef->setIsDead(false);
11280 if (KillsSCC)
11281 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
11282 SCCRedefine->eraseFromParent();
11283 return true;
11284}
11285
11286static bool foldableSelect(const MachineInstr &Def) {
11287 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
11288 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
11289 return false;
11290 bool Op1IsNonZeroImm =
11291 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
11292 bool Op2IsZeroImm =
11293 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
11294 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
11295 return false;
11296 return true;
11297}
11298
11299static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
11300 unsigned &NewDefOpc) {
11301 // S_ADD_U32 X, 1 sets SCC on carryout which can only happen if result==0.
11302 // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
11303 if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
11304 Def.getOpcode() != AMDGPU::S_ADD_U32)
11305 return false;
11306 const MachineOperand &AddSrc1 = Def.getOperand(1);
11307 const MachineOperand &AddSrc2 = Def.getOperand(2);
11308 int64_t addend;
11309
11310 if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
11311 (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
11312 (!getFoldableImm(&AddSrc1, addend) || addend != 1) &&
11313 (!getFoldableImm(&AddSrc2, addend) || addend != 1))
11314 return false;
11315
11316 if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
11317 const MachineOperand *SccDef =
11318 Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
11319 if (!SccDef->isDead())
11320 return false;
11321 NewDefOpc = AMDGPU::S_ADD_U32;
11322 }
11323 NeedInversion = !NeedInversion;
11324 return true;
11325}
11326
11328 Register SrcReg2, int64_t CmpMask,
11329 int64_t CmpValue,
11330 const MachineRegisterInfo *MRI) const {
11331 if (!SrcReg || SrcReg.isPhysical())
11332 return false;
11333
11334 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
11335 return false;
11336
// optimizeCmpSelect: try to drop a compare of SrcReg against zero by reusing
// the SCC value already produced by the instruction that defines SrcReg.
// NeedInversion is forwarded to optimizeSCC; the dispatch switch further down
// passes true for the S_CMP*_EQ_* opcodes and false for the S_CMP*_LG_*
// opcodes. Returns true only if optimizeSCC succeeded for CmpInstr.
11337 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
11338 this](bool NeedInversion) -> bool {
// Only compares against the constant 0 are handled here.
11339 if (CmpValue != 0)
11340 return false;
11341
11342 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11343 if (!Def)
11344 return false;
11345
11346 // For S_OP that set SCC = DST!=0, do the transformation
11347 //
11348 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11349 //
11350 // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and
11351 // do the transformation:
11352 //
11353 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11354 //
11355 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
11356 // for S_CSELECT* already has the same value that will be calculated by
11357 // s_cmp_lg_*
11358 //
11359 // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
11360 // (non-zero imm), 0)
11361
// NewDefOpc starts out as Def's current opcode. setsSCCIfResultIsZero may
// replace it (S_ADD_I32 -> S_ADD_U32) and flips NeedInversion when it matches.
11362 unsigned NewDefOpc = Def->getOpcode();
11363 if (!setsSCCIfResultIsNonZero(*Def) &&
11364 !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) &&
11365 !foldableSelect(*Def))
11366 return false;
11367
11368 if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
11369 return false;
11370
// The opcode change requested above is applied only after optimizeSCC has
// succeeded, so a failed attempt leaves Def untouched by this lambda.
11371 if (NewDefOpc != Def->getOpcode())
11372 Def->setDesc(get(NewDefOpc));
11373
11374 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11375 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
11376 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
11377 // sX = s_cselect_b64 (non-zero imm), 0
11378 // sLo = copy sX.sub0
11379 // sHi = copy sX.sub1
11380 // sY = s_or_b32 sLo, sHi
11381 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11382 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
11383 const MachineOperand &OrOpnd1 = Def->getOperand(1);
11384 const MachineOperand &OrOpnd2 = Def->getOperand(2);
11385 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11386 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
11387 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
// Both OR inputs must be COPYs of sub0 and sub1 (in that operand order) of
// the same source register.
11388 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11389 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
11390 Def2->getOperand(1).isReg() &&
11391 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
11392 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
11393 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
11394 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
// Best effort: the result of this second optimizeSCC call is ignored, and
// the lambda returns true either way.
11395 if (Select && foldableSelect(*Select))
11396 optimizeSCC(Select, Def, /*NeedInversion=*/false);
11397 }
11398 }
11399 }
11400 return true;
11401 };
11402
// optimizeCmpAnd: try to fold a compare of (S_AND_B32/B64 $src, single-bit
// mask) into the AND itself, or into S_BITCMP0/S_BITCMP1 when the AND result
// has no remaining non-debug use.
//   ExpectedValue - 0 or 1 at the call sites; shifted left by the mask's bit
//                   number and then compared against CmpValue.
//   SrcSize       - width in bits (32 or 64 at the call sites); used to
//                   truncate the mask and to choose the S_BITCMP opcode.
//   IsReversible  - if true, CmpValue may instead equal ExpectedValue ^ Mask,
//                   in which case the S_BITCMP0 form is required.
//   IsSigned      - if true, a mask selecting bit SrcSize - 1 is rejected.
// Returns true if the compare was folded, false if nothing was changed by
// this lambda's own checks or optimizeSCC declined.
11403 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11404 this](int64_t ExpectedValue, unsigned SrcSize,
11405 bool IsReversible, bool IsSigned) -> bool {
11406 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11407 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11408 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11409 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11410 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11411 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11412 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11413 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11414 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11415 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11416 //
11417 // Signed ge/gt are not used for the sign bit.
11418 //
11419 // If result of the AND is unused except in the compare:
11420 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11421 //
11422 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11423 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11424 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11425 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11426 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11427 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
11428
11429 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11430 if (!Def)
11431 return false;
11432
11433 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11434 Def->getOpcode() != AMDGPU::S_AND_B64)
11435 return false;
11436
// isMask stores the operand's constant (literal immediate or value found by
// getFoldableImm), truncated to SrcSize bits, in Mask and reports whether it
// is a power of two.
11437 int64_t Mask;
11438 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11439 if (MO->isImm())
11440 Mask = MO->getImm();
11441 else if (!getFoldableImm(MO, Mask))
11442 return false;
11443 Mask &= maxUIntN(SrcSize);
11444 return isPowerOf2_64(Mask);
11445 };
11446
// Either AND operand may be the mask; SrcOp ends up pointing at the other
// (non-mask) operand.
11447 MachineOperand *SrcOp = &Def->getOperand(1);
11448 if (isMask(SrcOp))
11449 SrcOp = &Def->getOperand(2);
11450 else if (isMask(&Def->getOperand(2)))
11451 SrcOp = &Def->getOperand(1);
11452 else
11453 return false;
11454
11455 // A valid Mask is required to have a single bit set, hence a non-zero and
11456 // power-of-two value. This verifies that we will not do 64-bit shift below.
11457 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11458 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
11459 if (IsSigned && BitNo == SrcSize - 1)
11460 return false;
11461
11462 ExpectedValue <<= BitNo;
11463
// IsReversedCC is set when CmpValue matches the opposite single-bit outcome
// (ExpectedValue ^ Mask) rather than ExpectedValue itself.
11464 bool IsReversedCC = false;
11465 if (CmpValue != ExpectedValue) {
11466 if (!IsReversible)
11467 return false;
11468 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11469 if (!IsReversedCC)
11470 return false;
11471 }
11472
// The reversed form is only produced via S_BITCMP0 below, which replaces the
// AND; that path requires the AND result to have exactly one non-debug use.
11473 Register DefReg = Def->getOperand(0).getReg();
11474 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
11475 return false;
11476
11477 if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false))
11478 return false;
11479
// The AND result still has non-debug uses, so the AND is kept as is.
// NOTE(review): presumably optimizeSCC has already removed CmpInstr's use of
// DefReg at this point - confirm against optimizeSCC.
11480 if (!MRI->use_nodbg_empty(DefReg)) {
11481 assert(!IsReversedCC);
11482 return true;
11483 }
11484
11485 // Replace AND with unused result with a S_BITCMP.
11486 MachineBasicBlock *MBB = Def->getParent();
11487
11488 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11489 : AMDGPU::S_BITCMP1_B32
11490 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11491 : AMDGPU::S_BITCMP1_B64;
11492
// The new instruction is inserted before Def, taking the non-mask source
// operand and the bit index; Def is then erased.
11493 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
11494 .add(*SrcOp)
11495 .addImm(BitNo);
11496 Def->eraseFromParent();
11497
11498 return true;
11499 };
11500
11501 switch (CmpInstr.getOpcode()) {
11502 default:
11503 break;
11504 case AMDGPU::S_CMP_EQ_U32:
11505 case AMDGPU::S_CMP_EQ_I32:
11506 case AMDGPU::S_CMPK_EQ_U32:
11507 case AMDGPU::S_CMPK_EQ_I32:
11508 return optimizeCmpAnd(1, 32, true, false) ||
11509 optimizeCmpSelect(/*NeedInversion=*/true);
11510 case AMDGPU::S_CMP_GE_U32:
11511 case AMDGPU::S_CMPK_GE_U32:
11512 return optimizeCmpAnd(1, 32, false, false);
11513 case AMDGPU::S_CMP_GE_I32:
11514 case AMDGPU::S_CMPK_GE_I32:
11515 return optimizeCmpAnd(1, 32, false, true);
11516 case AMDGPU::S_CMP_EQ_U64:
11517 return optimizeCmpAnd(1, 64, true, false);
11518 case AMDGPU::S_CMP_LG_U32:
11519 case AMDGPU::S_CMP_LG_I32:
11520 case AMDGPU::S_CMPK_LG_U32:
11521 case AMDGPU::S_CMPK_LG_I32:
11522 return optimizeCmpAnd(0, 32, true, false) ||
11523 optimizeCmpSelect(/*NeedInversion=*/false);
11524 case AMDGPU::S_CMP_GT_U32:
11525 case AMDGPU::S_CMPK_GT_U32:
11526 return optimizeCmpAnd(0, 32, false, false);
11527 case AMDGPU::S_CMP_GT_I32:
11528 case AMDGPU::S_CMPK_GT_I32:
11529 return optimizeCmpAnd(0, 32, false, true);
11530 case AMDGPU::S_CMP_LG_U64:
11531 return optimizeCmpAnd(0, 64, true, false) ||
11532 optimizeCmpSelect(/*NeedInversion=*/false);
11533 }
11534
11535 return false;
11536}
11537
11539 AMDGPU::OpName OpName) const {
11540 if (!ST.needsAlignedVGPRs())
11541 return;
11542
11543 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11544 if (OpNo < 0)
11545 return;
11546 MachineOperand &Op = MI.getOperand(OpNo);
11547 if (getOpSize(MI, OpNo) > 4)
11548 return;
11549
11550 // Add implicit aligned super-reg to force alignment on the data operand.
11551 const DebugLoc &DL = MI.getDebugLoc();
11552 MachineBasicBlock *BB = MI.getParent();
11554 Register DataReg = Op.getReg();
11555 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11557 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11558 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11559 Register NewVR =
11560 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11561 : &AMDGPU::VReg_64_Align2RegClass);
11562 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11563 .addReg(DataReg, {}, Op.getSubReg())
11564 .addImm(AMDGPU::sub0)
11565 .addReg(Undef)
11566 .addImm(AMDGPU::sub1);
11567 Op.setReg(NewVR);
11568 Op.setSubReg(AMDGPU::sub0);
11569 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11570}
11571
11573 if (isIGLP(*MI))
11574 return false;
11575
11577}
11578
11580 if (!isWMMA(MI) && !isSWMMAC(MI))
11581 return false;
11582
11583 if (ST.hasGFX1250Insts())
11584 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11585
11586 return true;
11587}
11588
11590 unsigned Opcode = MI.getOpcode();
11591
11592 if (AMDGPU::isGFX12Plus(ST))
11593 return isDOT(MI) || isXDLWMMA(MI);
11594
11595 if (!isMAI(MI) || isDGEMM(Opcode) ||
11596 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11597 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11598 return false;
11599
11600 if (!ST.hasGFX940Insts())
11601 return true;
11602
11603 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11604}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps, ArrayRef< Register > PhySGPRs={})
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static MachineBasicBlock * generateWaterFallLoop(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr, ArrayRef< Register > PhySGPRs={})
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static unsigned getSGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI)
static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion, unsigned &NewDefOpc)
static bool isSCCDeadOnExit(MachineBasicBlock *MBB)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static unsigned getAVSpillSaveOpcode(unsigned Size, bool NeedsCFI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getVGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI)
static constexpr AMDGPU::OpName ModifierOpNames[]
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:484
#define LLVM_DEBUG(...)
Definition Debug.h:119
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:158
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:124
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:301
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:354
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
MIRFormater - Interface to format MIR operand based on target.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr modifies (fully define or partially define) the specified register.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI void moveOperands(MachineOperand *Dst, MachineOperand *Src, unsigned NumOps)
Move NumOps operands from Src to Dst, updating use-def lists as needed.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
LLVM_ABI void clearVirtRegs()
clearVirtRegs - Remove all virtual registers (after physreg assignment).
void setRegAllocationHint(Register VReg, unsigned Type, Register PrefReg)
setRegAllocationHint - Specify a register allocation hint for the specified virtual register.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
iterator_range< use_iterator > use_operands(Register Reg) const
LLVM_ABI void removeRegOperandFromUseList(MachineOperand *MO)
Remove MO from its use-def list.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
LLVM_ABI void addRegOperandToUseList(MachineOperand *MO)
Add MO to the linked list of operands for its register.
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
bool canAddToBBProlog(const MachineInstr &MI) const
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
bool isSpill(uint32_t Opcode) const
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
bool mayAccessScratch(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, AMDGPU::FlatAddrSpace FlatVariant) const
Returns true if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
void storeRegToStackSlotCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
unsigned getOpSize(uint32_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool setsSCCIfResultIsNonZero(const MachineInstr &MI)
const MIRFormatter * getMIRFormatter() const override
static bool isXcntDrain(const MachineInstr &MI)
True if MI implicitly drains XCNT.
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
InstSizeVerifyMode getInstSizeVerifyMode(const MachineInstr &MI) const override
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
Register isStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool hasRAWDependency(const MachineInstr &FirstMI, const MachineInstr &SecondMI) const
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
void handleCopyToPhysHelper(SIInstrWorklist &Worklist, Register DstReg, MachineInstr &Inst, MachineRegisterInfo &MRI, DenseMap< MachineInstr *, V2PhysSCopyInfo > &WaterFalls, DenseMap< MachineInstr *, bool > &V2SPhyCopiesToErase) const
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool isVOPDAntidependencyAllowed(const MachineInstr &MI) const
If OpX is multicycle, anti-dependencies are not allowed.
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void createWaterFallForSiCall(MachineInstr *MI, MachineDominatorTree *MDT, ArrayRef< MachineOperand * > ScalarOps, ArrayRef< Register > PhySGPRs={}) const
Wrapper function for generating a waterfall for instruction MI. This function takes into consideration of...
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
ValueUniformity getGenericValueUniformity(const MachineInstr &MI) const
static bool isMAI(const MCInstrDesc &Desc)
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const override
static bool usesLGKM_CNT(const MachineInstr &MI)
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
const MachineOperand & getCalleeOperand(const MachineInstr &MI) const override
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
bool isAlwaysGDS(uint32_t Opcode) const
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
bool isLegalGFX12PlusPackedMathFP32or64BitOperand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 or 64 instructions.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI, bool NeedsCFI) const
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, AMDGPU::FlatAddrSpace FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
void createReadFirstLaneFromCopyToPhysReg(MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool isWWMRegSpillOpcode(uint32_t Opcode)
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
ValueUniformity getValueUniformity(const MachineInstr &MI) const final
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
bool allowNegativeFlatOffset(AMDGPU::FlatAddrSpace FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
std::optional< int64_t > getImmOrMaterializedImm(MachineOperand &Op) const
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst, DenseMap< MachineInstr *, V2PhysSCopyInfo > &WaterFalls, DenseMap< MachineInstr *, bool > &V2SPhyCopiesToErase) const
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
unsigned getScratchReservedForDynamicVGPRs() const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:301
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual const MachineOperand & getCalleeOperand(const MachineInstr &MI) const
Returns the callee operand from the given MI.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo & getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:190
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isDPMACCInstruction(unsigned Opc)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int32_t getCommuteRev(uint32_t Opcode)
LLVM_READONLY int32_t getCommuteOrig(uint32_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READONLY int32_t getGlobalVaddrOp(uint32_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READONLY int32_t getMFMAEarlyClobberOp(uint32_t Opcode)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getIfAddr64Inst(uint32_t Opcode)
Check if Opcode is an Addr64 opcode.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
LLVM_READONLY int32_t getAddr64Inst(uint32_t Opcode)
int32_t getMCOpcode(uint32_t Opcode, unsigned Gen)
bool isPackedFP32or64BitInst(unsigned Opc)
@ OPERAND_REG_IMM_V2FP64
Definition SIDefines.h:220
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:238
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:206
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:229
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:226
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:231
@ OPERAND_REG_IMM_V2INT64
Definition SIDefines.h:216
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:215
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:205
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:212
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:211
@ OPERAND_REG_IMM_V2FP16_SPLAT
Definition SIDefines.h:214
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:223
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:217
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:232
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:243
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:244
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:218
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:255
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:208
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:228
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:224
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:230
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:249
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:245
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:227
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:207
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:235
LLVM_READONLY int32_t getBasicFromSDWAOp(uint32_t Opcode)
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:614
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:616
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:613
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:615
@ TI_CONSTDATA_START
Definition AMDGPU.h:612
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READONLY int32_t getFlatScratchInstSVfromSS(uint32_t Opcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:558
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:207
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition MathExtras.h:546
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
int countr_zero(T Val)
Count the number of 0 bits from the least significant bit toward the most significant bit, stopping at the first 1.
Definition bit.h:204
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
constexpr RegState getUndefRegState(bool B)
ValueUniformity
Enum describing how values behave with respect to uniformity and divergence, to answer the question: ...
Definition Uniformity.h:18
@ AlwaysUniform
The result value is always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result value can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
MachineCycleInfo::CycleT MachineCycle
static const MachineMemOperand::Flags MOThreadPrivate
Mark the MMO of accesses to memory locations that are never written to by other threads.
Definition SIInstrInfo.h:63
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
constexpr bool all() const
Definition LaneBitmask.h:54
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:67
MachineInstr * top() const
Definition SIInstrInfo.h:72
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:91
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands read the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.