LLVM 23.0.0git
SIInstrInfo.cpp
Go to the documentation of this file.
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/MC/MCContext.h"
38
39using namespace llvm;
40
41#define DEBUG_TYPE "si-instr-info"
42
43#define GET_INSTRINFO_CTOR_DTOR
44#include "AMDGPUGenInstrInfo.inc"
45
46namespace llvm::AMDGPU {
47#define GET_D16ImageDimIntrinsics_IMPL
48#define GET_ImageDimIntrinsicTable_IMPL
49#define GET_RsrcIntrinsics_IMPL
50#include "AMDGPUGenSearchableTables.inc"
51} // namespace llvm::AMDGPU
52
53// Must be at least 4 to be able to branch over minimum unconditional branch
54// code. This is only for making it possible to write reasonably small tests for
55// long branches.
57BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
58 cl::desc("Restrict range of branch instructions (DEBUG)"));
59
61 "amdgpu-fix-16-bit-physreg-copies",
62 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
63 cl::init(true),
65
67 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
68 AMDGPU::ADJCALLSTACKDOWN),
69 RI(ST), ST(ST) {
70 SchedModel.init(&ST);
71}
72
73//===----------------------------------------------------------------------===//
74// TargetInstrInfo callbacks
75//===----------------------------------------------------------------------===//
76
77static unsigned getNumOperandsNoGlue(SDNode *Node) {
78 unsigned N = Node->getNumOperands();
79 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
80 --N;
81 return N;
82}
83
84/// Returns true if both nodes have the same value for the given
85/// operand \p OpName, or if both nodes do not have this operand.
87 AMDGPU::OpName OpName) {
88 unsigned Opc0 = N0->getMachineOpcode();
89 unsigned Opc1 = N1->getMachineOpcode();
90
91 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
92 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
93
// Neither opcode has the named operand: report the nodes as matching.
94 if (Op0Idx == -1 && Op1Idx == -1)
95 return true;
96
97
// Exactly one of the two opcodes has the named operand: they cannot match.
98 if ((Op0Idx == -1 && Op1Idx != -1) ||
99 (Op1Idx == -1 && Op0Idx != -1))
100 return false;
101
102 // getNamedOperandIdx returns the index for the MachineInstr's operands,
103 // which includes the result as the first operand. We are indexing into the
104 // MachineSDNode's operands, so we need to skip the result operand to get
105 // the real index.
106 --Op0Idx;
107 --Op1Idx;
108
// Compare the operand values at the adjusted positions.
109 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
110}
111
// Classifies \p MI by instruction kind for the rematerialization query.
112static bool canRemat(const MachineInstr &MI) {
113
// NOTE(review): the condition guarding this early return is not present in
// this listing; confirm against the upstream file before relying on it.
117 return true;
118
// A scalar memory read qualifies only if it carries at least one memory
// operand and every one of its memory operands is an invariant load.
119 if (SIInstrInfo::isSMRD(MI)) {
120 return !MI.memoperands_empty() &&
121 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
122 return MMO->isLoad() && MMO->isInvariant();
123 });
124 }
125
// Anything else is rejected.
126 return false;
127}
128
130 const MachineInstr &MI) const {
131
132 if (canRemat(MI)) {
133 // Normally VALU use of exec would block the rematerialization, but that
134 // is OK in this case to have an implicit exec read as all VALU do.
135 // We really want all of the generic logic for this except for this.
136
137 // Another potential implicit use is mode register. The core logic of
138 // the RA will not attempt rematerialization if mode is set anywhere
139 // in the function, otherwise it is safe since mode is not changed.
140
141 // This differs from the generic method, which does not allow
142 // rematerialization if there are virtual register uses. We allow this,
143 // therefore this method includes SOP instructions as well.
// Accept only when MI has no implicit defs, carries exactly as many implicit
// operands as its descriptor lists implicit uses, and cannot raise an FP
// exception.
144 if (!MI.hasImplicitDef() &&
145 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
146 !MI.mayRaiseFPException())
147 return true;
148 }
149
// NOTE(review): the fallback return statement is not present in this listing;
// confirm against the upstream file.
151}
152
153// Returns true if the result of a VALU instruction depends on exec.
154bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
155 assert(isVALU(MI));
156
157 // If it is convergent it depends on EXEC.
158 if (MI.isConvergent())
159 return true;
160
161 // If it defines SGPR it depends on EXEC
162 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
163 for (const MachineOperand &Def : MI.defs()) {
164 if (!Def.isReg())
165 continue;
166
167 Register Reg = Def.getReg();
168 if (Reg && RI.isSGPRReg(MRI, Reg))
169 return true;
170 }
171
172 return false;
173}
174
176 // Any implicit use of exec by VALU is not a real register read.
// The use is ignorable only when it is an implicit EXEC operand whose parent
// is a VALU instruction for which resultDependsOnExec() is false.
177 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
178 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
179}
180
182 MachineBasicBlock *SuccToSinkTo,
183 MachineCycleInfo *CI) const {
184 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
185 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
186 return true;
187
188 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
189 // Check if sinking of MI would create temporal divergent use.
190 for (auto Op : MI.uses()) {
// Only virtual registers whose class is an SGPR class are examined.
191 if (Op.isReg() && Op.getReg().isVirtual() &&
192 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
193 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
194
195 // SgprDef defined inside cycle
196 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
// A definition that is not inside any cycle needs no further checking.
197 if (FromCycle == nullptr)
198 continue;
199
200 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
201 // Check if there is a FromCycle that contains SgprDef's basic block but
202 // does not contain SuccToSinkTo and also has divergent exit condition.
// Walk outwards through the parent cycles until one contains ToCycle or the
// outermost cycle has been passed.
203 while (FromCycle && !FromCycle->contains(ToCycle)) {
205 FromCycle->getExitingBlocks(ExitingBlocks);
206
207 // FromCycle has divergent exit condition.
208 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
209 if (hasDivergentBranch(ExitingBlock))
210 return false;
211 }
212
213 FromCycle = FromCycle->getParentCycle();
214 }
215 }
216 }
217
// None of the examined operands blocks the sink.
218 return true;
219}
220
222 int64_t &Offset0,
223 int64_t &Offset1) const {
// Both nodes must already be selected to machine opcodes.
224 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
225 return false;
226
227 unsigned Opc0 = Load0->getMachineOpcode();
228 unsigned Opc1 = Load1->getMachineOpcode();
229
230 // Make sure both are actually loads.
231 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
232 return false;
233
234 // A mayLoad instruction without a def is not a load. Likely a prefetch.
235 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
236 return false;
237
// Case 1: both instructions are DS instructions.
238 if (isDS(Opc0) && isDS(Opc1)) {
239
240 // FIXME: Handle this case:
241 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
242 return false;
243
244 // Check base reg.
245 if (Load0->getOperand(0) != Load1->getOperand(0))
246 return false;
247
248 // Skip read2 / write2 variants for simplicity.
249 // TODO: We should report true if the used offsets are adjacent (excluded
250 // st64 versions).
251 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
252 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
253 if (Offset0Idx == -1 || Offset1Idx == -1)
254 return false;
255
256 // XXX - be careful of dataless loads
257 // getNamedOperandIdx returns the index for MachineInstrs. Since they
258 // include the output in the operand list, but SDNodes don't, we need to
259 // subtract the number of defs from the index.
260 Offset0Idx -= get(Opc0).NumDefs;
261 Offset1Idx -= get(Opc1).NumDefs;
262 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
263 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
264 return true;
265 }
266
// Case 2: both instructions are SMRD instructions.
267 if (isSMRD(Opc0) && isSMRD(Opc1)) {
268 // Skip time and cache invalidation instructions.
269 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
270 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
271 return false;
272
273 unsigned NumOps = getNumOperandsNoGlue(Load0);
274 if (NumOps != getNumOperandsNoGlue(Load1))
275 return false;
276
277 // Check base reg.
278 if (Load0->getOperand(0) != Load1->getOperand(0))
279 return false;
280
281 // Match register offsets, if both register and immediate offsets present.
282 assert(NumOps == 4 || NumOps == 5);
283 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
284 return false;
285
// NOTE(review): the initializers of the two declarations below are not
// present in this listing; confirm against the upstream file.
286 const ConstantSDNode *Load0Offset =
288 const ConstantSDNode *Load1Offset =
290
// Give up unless both offsets are available as constants.
291 if (!Load0Offset || !Load1Offset)
292 return false;
293
294 Offset0 = Load0Offset->getZExtValue();
295 Offset1 = Load1Offset->getZExtValue();
296 return true;
297 }
298
299 // MUBUF and MTBUF can access the same addresses.
300 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
301
302 // MUBUF and MTBUF have vaddr at different indices.
303 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
304 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
305 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
306 return false;
307
308 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
309 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
310
311 if (OffIdx0 == -1 || OffIdx1 == -1)
312 return false;
313
314 // getNamedOperandIdx returns the index for MachineInstrs. Since they
315 // include the output in the operand list, but SDNodes don't, we need to
316 // subtract the number of defs from the index.
317 OffIdx0 -= get(Opc0).NumDefs;
318 OffIdx1 -= get(Opc1).NumDefs;
319
320 SDValue Off0 = Load0->getOperand(OffIdx0);
321 SDValue Off1 = Load1->getOperand(OffIdx1);
322
323 // The offset might be a FrameIndexSDNode.
324 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
325 return false;
326
327 Offset0 = Off0->getAsZExtVal();
328 Offset1 = Off1->getAsZExtVal();
329 return true;
330 }
331
// Any other pairing of instruction kinds is not handled.
332 return false;
333}
334
335static bool isStride64(unsigned Opc) {
336 switch (Opc) {
337 case AMDGPU::DS_READ2ST64_B32:
338 case AMDGPU::DS_READ2ST64_B64:
339 case AMDGPU::DS_WRITE2ST64_B32:
340 case AMDGPU::DS_WRITE2ST64_B64:
341 return true;
342 default:
343 return false;
344 }
345}
346
349 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
350 const TargetRegisterInfo *TRI) const {
// Only instructions that may load or store are analysed.
351 if (!LdSt.mayLoadOrStore())
352 return false;
353
354 unsigned Opc = LdSt.getOpcode();
// Every path below reports a non-scalable offset.
355 OffsetIsScalable = false;
356 const MachineOperand *BaseOp, *OffsetOp;
357 int DataOpIdx;
358
// DS instructions: base is the addr operand.
359 if (isDS(LdSt)) {
360 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
361 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
362 if (OffsetOp) {
363 // Normal, single offset LDS instruction.
364 if (!BaseOp) {
365 // DS_CONSUME/DS_APPEND use M0 for the base address.
366 // TODO: find the implicit use operand for M0 and use that as BaseOp?
367 return false;
368 }
369 BaseOps.push_back(BaseOp);
370 Offset = OffsetOp->getImm();
371 // Get appropriate operand, and compute width accordingly.
372 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
373 if (DataOpIdx == -1)
374 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
// DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 gets a fixed width instead of one derived
// from an operand.
375 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
376 Width = LocationSize::precise(64);
377 else
378 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
379 } else {
380 // The 2 offset instructions use offset0 and offset1 instead. We can treat
381 // these as a load with a single offset if the 2 offsets are consecutive.
382 // We will use this for some partially aligned loads.
// NOTE(review): Offset0Op, Offset1Op and BaseOp are dereferenced or pushed on
// this path without a null check; presumably every DS opcode lacking `offset`
// has all three operands — confirm.
383 const MachineOperand *Offset0Op =
384 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
385 const MachineOperand *Offset1Op =
386 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
387
// Only the low 8 bits of each immediate are used.
388 unsigned Offset0 = Offset0Op->getImm() & 0xff;
389 unsigned Offset1 = Offset1Op->getImm() & 0xff;
390 if (Offset0 + 1 != Offset1)
391 return false;
392
393 // Each of these offsets is in element sized units, so we need to convert
394 // to bytes of the individual reads.
395
// For loads the size of operand 0 is divided by 16 rather than 8; presumably
// operand 0 holds both elements, so one element is half of it — confirm.
396 unsigned EltSize;
397 if (LdSt.mayLoad())
398 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
399 else {
400 assert(LdSt.mayStore());
401 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
402 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
403 }
404
// The ST64 variants scale the element size by 64.
405 if (isStride64(Opc))
406 EltSize *= 64;
407
408 BaseOps.push_back(BaseOp);
409 Offset = EltSize * Offset0;
410 // Get appropriate operand(s), and compute width accordingly.
411 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
412 if (DataOpIdx == -1) {
// No vdst: the width is the sum of the data0 and data1 operand sizes.
413 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
414 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
415 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
416 Width = LocationSize::precise(
417 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
418 } else {
419 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
420 }
421 }
422 return true;
423 }
424
// MUBUF / MTBUF: bases are srsrc, then vaddr (unless it is a frame index),
// then soffset when it is a register.
425 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
426 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
427 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
428 return false;
429 BaseOps.push_back(RSrc);
430 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
431 if (BaseOp && !BaseOp->isFI())
432 BaseOps.push_back(BaseOp);
433 const MachineOperand *OffsetImm =
434 getNamedOperand(LdSt, AMDGPU::OpName::offset);
435 Offset = OffsetImm->getImm();
436 const MachineOperand *SOffset =
437 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
// A register soffset becomes another base operand; an immediate soffset is
// folded into Offset.
438 if (SOffset) {
439 if (SOffset->isReg())
440 BaseOps.push_back(SOffset);
441 else
442 Offset += SOffset->getImm();
443 }
444 // Get appropriate operand, and compute width accordingly.
445 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
446 if (DataOpIdx == -1)
447 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
448 if (DataOpIdx == -1) // LDS DMA
449 return false;
450 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
451 return true;
452 }
453
// Image instructions: bases are the resource operand followed by the address
// operand(s); Offset is always 0.
454 if (isImage(LdSt)) {
455 auto RsrcOpName =
456 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
457 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
458 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
459 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
460 if (VAddr0Idx >= 0) {
461 // GFX10 possible NSA encoding.
462 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
463 BaseOps.push_back(&LdSt.getOperand(I));
464 } else {
465 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
466 }
467 Offset = 0;
468 // Get appropriate operand, and compute width accordingly.
469 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
470 if (DataOpIdx == -1)
471 return false; // no return sampler
472 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
473 return true;
474 }
475
// SMRD: base is sbase; a missing offset operand is treated as 0.
476 if (isSMRD(LdSt)) {
477 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
478 if (!BaseOp) // e.g. S_MEMTIME
479 return false;
480 BaseOps.push_back(BaseOp);
481 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
482 Offset = OffsetOp ? OffsetOp->getImm() : 0;
483 // Get appropriate operand, and compute width accordingly.
484 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
485 if (DataOpIdx == -1)
486 return false;
487 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
488 return true;
489 }
490
// FLAT: bases are whichever of vaddr and saddr are present.
491 if (isFLAT(LdSt)) {
492 // Instructions have either vaddr or saddr or both or none.
493 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
494 if (BaseOp)
495 BaseOps.push_back(BaseOp);
496 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
497 if (BaseOp)
498 BaseOps.push_back(BaseOp);
499 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
500 // Get appropriate operand, and compute width accordingly.
501 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
502 if (DataOpIdx == -1)
503 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
504 if (DataOpIdx == -1) // LDS DMA
505 return false;
506 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
507 return true;
508 }
509
// Any other kind of memory instruction is not handled.
510 return false;
511}
512
/// Returns true if the two memory instructions appear to use the same base:
/// either their first base operands are identical, or each instruction has
/// exactly one memory operand, both in the same address space, whose
/// underlying IR objects are the same non-undef value.
513static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
515 const MachineInstr &MI2,
517 // Only examine the first "base" operand of each instruction, on the
518 // assumption that it represents the real base address of the memory access.
519 // Other operands are typically offsets or indices from this base address.
520 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
521 return true;
522
// Fall back to the memory operands; this needs exactly one on each side.
523 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
524 return false;
525
526 auto *MO1 = *MI1.memoperands_begin();
527 auto *MO2 = *MI2.memoperands_begin();
528 if (MO1->getAddrSpace() != MO2->getAddrSpace())
529 return false;
530
531 const auto *Base1 = MO1->getValue();
532 const auto *Base2 = MO2->getValue();
// A memory operand without an IR value cannot be compared.
533 if (!Base1 || !Base2)
534 return false;
535 Base1 = getUnderlyingObject(Base1);
536 Base2 = getUnderlyingObject(Base2);
537
// Two undef bases are never reported as the same base.
538 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
539 return false;
540
541 return Base1 == Base2;
542}
543
545 int64_t Offset1, bool OffsetIsScalable1,
547 int64_t Offset2, bool OffsetIsScalable2,
548 unsigned ClusterSize,
549 unsigned NumBytes) const {
550 // If the mem ops (to be clustered) do not have the same base ptr, then they
551 // should not be clustered
// The default limit is used unless both base-operand lists are non-empty, in
// which case the per-function limit replaces it.
552 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
553 if (!BaseOps1.empty() && !BaseOps2.empty()) {
554 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
555 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
556 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
557 return false;
558
559 const SIMachineFunctionInfo *MFI =
560 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
561 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
562 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
563 // If only one base op is empty, they do not have the same base ptr
564 return false;
565 }
566
567 // In order to avoid register pressure, on an average, the number of DWORDS
568 // loaded together by all clustered mem ops should not exceed
569 // MaxMemoryClusterDWords. This is an empirical value based on certain
570 // observations and performance related experiments.
571 // The good thing about this heuristic is - it avoids clustering of too many
572 // sub-word loads, and also avoids clustering of wide loads. Below is the
573 // brief summary of how the heuristic behaves for various `LoadSize` when
574 // MaxMemoryClusterDWords is 8.
575 //
576 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
577 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
578 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
579 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
580 // (5) LoadSize >= 17: do not cluster
// LoadSize is the average number of bytes per clustered op; it is rounded up
// to whole dwords and multiplied back by the number of ops.
// NOTE(review): ClusterSize is used as a divisor, so this assumes callers
// never pass 0 — confirm.
581 const unsigned LoadSize = NumBytes / ClusterSize;
582 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
583 return NumDWords <= MaxMemoryClusterDWords;
584}
585
586// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
587// the first 16 loads will be interleaved with the stores, and the next 16 will
588// be clustered as expected. It should really split into 2 16 store batches.
589//
590// Loads are clustered until this returns false, rather than trying to schedule
591// groups of stores. This also means we have to deal with saying different
592// address space loads should be clustered, and ones which might cause bank
593// conflicts.
594//
595// This might be deprecated so it might not be worth that much effort to fix.
597 int64_t Offset0, int64_t Offset1,
598 unsigned NumLoads) const {
// Callers must pass the offsets in strictly ascending order.
599 assert(Offset1 > Offset0 &&
600 "Second offset should be larger than first offset!");
601 // If we have at most 16 loads in a row, and the offsets are within 64
602 // bytes, then schedule together.
603
604 // A cacheline is 64 bytes (for global memory).
605 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
606}
607
610 const DebugLoc &DL, MCRegister DestReg,
611 MCRegister SrcReg, bool KillSrc,
612 const char *Msg = "illegal VGPR to SGPR copy") {
613 MachineFunction *MF = MBB.getParent();
614
// Report the copy as an "unsupported" error diagnostic attributed to the
// enclosing function at DL.
// NOTE(review): the declaration of C is not present in this listing.
616 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
617
// Still emit an SI_ILLEGAL_COPY pseudo from SrcReg to DestReg at the insertion
// point.
618 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
619 .addReg(SrcReg, getKillRegState(KillSrc));
620}
621
622/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
623/// possible to have a direct copy in these cases on GFX908, so an intermediate
624/// VGPR copy is required.
628 const DebugLoc &DL, MCRegister DestReg,
629 MCRegister SrcReg, bool KillSrc,
630 RegScavenger &RS, bool RegsOverlap,
631 Register ImpDefSuperReg = Register(),
632 Register ImpUseSuperReg = Register()) {
633 assert((TII.getSubtarget().hasMAIInsts() &&
634 !TII.getSubtarget().hasGFX90AInsts()) &&
635 "Expected GFX908 subtarget.");
636
637 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
638 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
639 "Source register of the copy should be either an SGPR or an AGPR.");
640
641 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
642 "Destination register of the copy should be an AGPR.");
643
644 const SIRegisterInfo &RI = TII.getRegisterInfo();
645
646 // First try to find defining accvgpr_write to avoid temporary registers.
647 // In the case of copies of overlapping AGPRs, we conservatively do not
648 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
649 // an accvgpr_write used for this same copy due to implicit-defs
650 if (!RegsOverlap) {
// Scan backwards from MI towards the start of the block for the nearest
// instruction that modifies SrcReg.
651 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
652 --Def;
653
654 if (!Def->modifiesRegister(SrcReg, &RI))
655 continue;
656
// The nearest modifier must be an accvgpr_write whose destination is exactly
// SrcReg; otherwise fall through to the temporary-VGPR path.
657 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
658 Def->getOperand(0).getReg() != SrcReg)
659 break;
660
661 MachineOperand &DefOp = Def->getOperand(1);
662 assert(DefOp.isReg() || DefOp.isImm());
663
664 if (DefOp.isReg()) {
665 bool SafeToPropagate = true;
666 // Check that register source operand is not clobbered before MI.
667 // Immediate operands are always safe to propagate.
668 for (auto I = Def; I != MI && SafeToPropagate; ++I)
669 if (I->modifiesRegister(DefOp.getReg(), &RI))
670 SafeToPropagate = false;
671
672 if (!SafeToPropagate)
673 break;
674
// The source register is about to be read again at MI, so drop any kill flags
// for it between Def and MI.
675 for (auto I = Def; I != MI; ++I)
676 I->clearRegisterKills(DefOp.getReg(), &RI);
677 }
678
// Re-issue the accvgpr_write with the same source operand, now targeting
// DestReg.
679 MachineInstrBuilder Builder =
680 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
681 .add(DefOp);
682 if (ImpDefSuperReg)
683 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
684
685 if (ImpUseSuperReg) {
686 Builder.addReg(ImpUseSuperReg,
688 }
689
690 return;
691 }
692 }
693
// No reusable accvgpr_write was found: go through a temporary VGPR. Prepare
// the scavenger so it can be queried at MI.
694 RS.enterBasicBlockEnd(MBB);
695 RS.backward(std::next(MI));
696
697 // Ideally we want to have three registers for a long reg_sequence copy
698 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
699 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
700 *MBB.getParent());
701
702 // Registers in the sequence are allocated contiguously so we can just
703 // use register number to pick one of three round-robin temps.
704 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
// Start from the VGPR the function info provides for AGPR copies; it is used
// as-is when RegNo is 0 or when scavenging yields nothing usable.
705 Register Tmp =
706 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
707 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
708 "VGPR used for an intermediate copy should have been reserved.");
709
710 // Only loop through if there are any free registers left. We don't want to
711 // spill.
712 while (RegNo--) {
713 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
714 /* RestoreAfter */ false, 0,
715 /* AllowSpill */ false);
// Stop when nothing is free or the candidate's hardware index is at or above
// the pressure limit.
716 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
717 break;
718 Tmp = Tmp2;
719 RS.setRegUsed(Tmp);
720 }
721
722 // Insert copy to temporary VGPR.
// An AGPR source is read with accvgpr_read; an SGPR source uses v_mov_b32.
723 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
724 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
725 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
726 } else {
727 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
728 }
729
730 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
731 .addReg(SrcReg, getKillRegState(KillSrc));
732 if (ImpUseSuperReg) {
733 UseBuilder.addReg(ImpUseSuperReg,
735 }
736
// Finally write the temporary into the destination AGPR; the temporary is
// killed here.
737 MachineInstrBuilder DefBuilder
738 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
739 .addReg(Tmp, RegState::Kill);
740
741 if (ImpDefSuperReg)
742 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
743}
744
747 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
748 const TargetRegisterClass *RC, bool Forward) {
749 const SIRegisterInfo &RI = TII.getRegisterInfo();
// Sub-register indices that split RC into parts of size 4 (presumably bytes,
// i.e. one 32-bit SGPR per part — confirm).
750 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
752 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
753
754 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
755 int16_t SubIdx = BaseIndices[Idx];
756 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
757 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
758 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
759 unsigned Opcode = AMDGPU::S_MOV_B32;
760
761 // Is SGPR aligned? If so try to combine with next.
762 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
763 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
764 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
765 // Can use SGPR64 copy
766 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
767 SubIdx = RI.getSubRegFromChannel(Channel, 2);
768 DestSubReg = RI.getSubReg(DestReg, SubIdx);
769 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
770 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
771 Opcode = AMDGPU::S_MOV_B64;
// The 64-bit move also covers the next part, so skip it.
772 Idx++;
773 }
774
// Each partial move also carries an implicit use of the full source register.
775 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
776 .addReg(SrcSubReg)
777 .addReg(SrcReg, RegState::Implicit);
778
779 if (!FirstMI)
780 FirstMI = LastMI;
781
// When not copying forward, step the insertion iterator back one position
// after each emitted move.
782 if (!Forward)
783 I--;
784 }
785
786 assert(FirstMI && LastMI);
// When not copying forward, exchange the two pointers before attaching the
// implicit def and the kill below.
787 if (!Forward)
788 std::swap(FirstMI, LastMI);
789
// FirstMI receives an implicit def of the whole destination register.
790 FirstMI->addOperand(
791 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
792
// LastMI, when requested, is marked as killing the whole source register.
793 if (KillSrc)
794 LastMI->addRegisterKilled(SrcReg, &RI);
795}
796
799 const DebugLoc &DL, Register DestReg,
800 Register SrcReg, bool KillSrc, bool RenamableDest,
801 bool RenamableSrc) const {
802 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
803 unsigned Size = RI.getRegSizeInBits(*RC);
804 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
805 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
806
807 // The rest of copyPhysReg assumes Src and Dst size are the same size.
808 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
809 // we remove Fix16BitCopies and this code block?
810 if (Fix16BitCopies) {
811 if (((Size == 16) != (SrcSize == 16))) {
812 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
813 assert(ST.useRealTrue16Insts());
814 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
815 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
816 RegToFix = SubReg;
817
818 if (DestReg == SrcReg) {
819 // Identity copy. Insert empty bundle since ExpandPostRA expects an
820 // instruction here.
821 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
822 return;
823 }
824 RC = RI.getPhysRegBaseClass(DestReg);
825 Size = RI.getRegSizeInBits(*RC);
826 SrcRC = RI.getPhysRegBaseClass(SrcReg);
827 SrcSize = RI.getRegSizeInBits(*SrcRC);
828 }
829 }
830
831 if (RC == &AMDGPU::VGPR_32RegClass) {
832 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
833 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
834 AMDGPU::AGPR_32RegClass.contains(SrcReg));
835 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
836 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
837 BuildMI(MBB, MI, DL, get(Opc), DestReg)
838 .addReg(SrcReg, getKillRegState(KillSrc));
839 return;
840 }
841
842 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
843 RC == &AMDGPU::SReg_32RegClass) {
844 if (SrcReg == AMDGPU::SCC) {
845 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
846 .addImm(1)
847 .addImm(0);
848 return;
849 }
850
851 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
852 if (DestReg == AMDGPU::VCC_LO) {
853 // FIXME: Hack until VReg_1 removed.
854 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
855 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
856 .addImm(0)
857 .addReg(SrcReg, getKillRegState(KillSrc));
858 return;
859 }
860
861 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
862 return;
863 }
864
865 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
866 .addReg(SrcReg, getKillRegState(KillSrc));
867 return;
868 }
869
870 if (RC == &AMDGPU::SReg_64RegClass) {
871 if (SrcReg == AMDGPU::SCC) {
872 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
873 .addImm(1)
874 .addImm(0);
875 return;
876 }
877
878 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
879 if (DestReg == AMDGPU::VCC) {
880 // FIXME: Hack until VReg_1 removed.
881 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
882 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
883 .addImm(0)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 return;
886 }
887
888 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
889 return;
890 }
891
892 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
893 .addReg(SrcReg, getKillRegState(KillSrc));
894 return;
895 }
896
897 if (DestReg == AMDGPU::SCC) {
898 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
899 // but SelectionDAG emits such copies for i1 sources.
900 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
901 // This copy can only be produced by patterns
902 // with explicit SCC, which are known to be enabled
903 // only for subtargets with S_CMP_LG_U64 present.
904 assert(ST.hasScalarCompareEq64());
905 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
906 .addReg(SrcReg, getKillRegState(KillSrc))
907 .addImm(0);
908 } else {
909 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
910 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
911 .addReg(SrcReg, getKillRegState(KillSrc))
912 .addImm(0);
913 }
914
915 return;
916 }
917
918 if (RC == &AMDGPU::AGPR_32RegClass) {
919 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
920 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
921 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
922 .addReg(SrcReg, getKillRegState(KillSrc));
923 return;
924 }
925
926 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
927 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
928 .addReg(SrcReg, getKillRegState(KillSrc));
929 return;
930 }
931
932 // FIXME: Pass should maintain scavenger to avoid scan through the block on
933 // every AGPR spill.
934 RegScavenger RS;
935 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
936 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
937 return;
938 }
939
940 if (Size == 16) {
941 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
942 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
943 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
944
945 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
946 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
947 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
948 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
949 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
950 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
951 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
952 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
953
954 if (IsSGPRDst) {
955 if (!IsSGPRSrc) {
956 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
957 return;
958 }
959
960 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
961 .addReg(NewSrcReg, getKillRegState(KillSrc));
962 return;
963 }
964
965 if (IsAGPRDst || IsAGPRSrc) {
966 if (!DstLow || !SrcLow) {
967 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
968 "Cannot use hi16 subreg with an AGPR!");
969 }
970
971 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
972 return;
973 }
974
975 if (ST.useRealTrue16Insts()) {
976 if (IsSGPRSrc) {
977 assert(SrcLow);
978 SrcReg = NewSrcReg;
979 }
980 // Use the smaller instruction encoding if possible.
981 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
982 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
983 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
984 .addReg(SrcReg);
985 } else {
986 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
987 .addImm(0) // src0_modifiers
988 .addReg(SrcReg)
989 .addImm(0); // op_sel
990 }
991 return;
992 }
993
994 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
995 if (!DstLow || !SrcLow) {
996 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
997 "Cannot use hi16 subreg on VI!");
998 }
999
1000 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1001 .addReg(NewSrcReg, getKillRegState(KillSrc));
1002 return;
1003 }
1004
1005 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1006 .addImm(0) // src0_modifiers
1007 .addReg(NewSrcReg)
1008 .addImm(0) // clamp
1015 // First implicit operand is $exec.
1016 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1017 return;
1018 }
1019
1020 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1021 if (ST.hasVMovB64Inst()) {
1022 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1023 .addReg(SrcReg, getKillRegState(KillSrc));
1024 return;
1025 }
1026 if (ST.hasPkMovB32()) {
1027 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1029 .addReg(SrcReg)
1031 .addReg(SrcReg)
1032 .addImm(0) // op_sel_lo
1033 .addImm(0) // op_sel_hi
1034 .addImm(0) // neg_lo
1035 .addImm(0) // neg_hi
1036 .addImm(0) // clamp
1037 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1038 return;
1039 }
1040 }
1041
1042 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1043 if (RI.isSGPRClass(RC)) {
1044 if (!RI.isSGPRClass(SrcRC)) {
1045 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1046 return;
1047 }
1048 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1049 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1050 Forward);
1051 return;
1052 }
1053
1054 unsigned EltSize = 4;
1055 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1056 if (RI.isAGPRClass(RC)) {
1057 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1058 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1059 else if (RI.hasVGPRs(SrcRC) ||
1060 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1061 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1062 else
1063 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1064 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1065 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1066 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1067 (RI.isProperlyAlignedRC(*RC) &&
1068 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1069 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1070 if (ST.hasVMovB64Inst()) {
1071 Opcode = AMDGPU::V_MOV_B64_e32;
1072 EltSize = 8;
1073 } else if (ST.hasPkMovB32()) {
1074 Opcode = AMDGPU::V_PK_MOV_B32;
1075 EltSize = 8;
1076 }
1077 }
1078
1079 // For the cases where we need an intermediate instruction/temporary register
1080 // (destination is an AGPR), we need a scavenger.
1081 //
1082 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1083 // whole block for every handled copy.
1084 std::unique_ptr<RegScavenger> RS;
1085 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1086 RS = std::make_unique<RegScavenger>();
1087
1088 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1089
1090 // If there is an overlap, we can't kill the super-register on the last
1091 // instruction, since it will also kill the components made live by this def.
1092 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1093 const bool CanKillSuperReg = KillSrc && !Overlap;
1094
1095 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1096 unsigned SubIdx;
1097 if (Forward)
1098 SubIdx = SubIndices[Idx];
1099 else
1100 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1101 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1102 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1103 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1104
1105 bool IsFirstSubreg = Idx == 0;
1106 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1107
1108 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1109 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1110 Register ImpUseSuper = SrcReg;
1111 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1112 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1113 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1115 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1117 .addReg(SrcSubReg)
1119 .addReg(SrcSubReg)
1120 .addImm(0) // op_sel_lo
1121 .addImm(0) // op_sel_hi
1122 .addImm(0) // neg_lo
1123 .addImm(0) // neg_hi
1124 .addImm(0) // clamp
1125 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1126 if (IsFirstSubreg)
1128 } else {
1129 MachineInstrBuilder Builder =
1130 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1131 if (IsFirstSubreg)
1132 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1133
1134 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1135 }
1136 }
1137}
1138
1139int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1140 int32_t NewOpc;
1141
1142 // Try to map original to commuted opcode
1143 NewOpc = AMDGPU::getCommuteRev(Opcode);
1144 if (NewOpc != -1)
1145 // Check if the commuted (REV) opcode exists on the target.
1146 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1147
1148 // Try to map commuted to original opcode
1149 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1150 if (NewOpc != -1)
1151 // Check if the original (non-REV) opcode exists on the target.
1152 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1153
1154 return Opcode;
1155}
1156
1157const TargetRegisterClass *
1159 return &AMDGPU::VGPR_32RegClass;
1160}
1161
1164 const DebugLoc &DL, Register DstReg,
1166 Register TrueReg,
1167 Register FalseReg) const {
1168 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1169 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1171 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1172 "Not a VGPR32 reg");
1173
1174 if (Cond.size() == 1) {
1175 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1176 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1177 .add(Cond[0]);
1178 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1179 .addImm(0)
1180 .addReg(FalseReg)
1181 .addImm(0)
1182 .addReg(TrueReg)
1183 .addReg(SReg);
1184 } else if (Cond.size() == 2) {
1185 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1186 switch (Cond[0].getImm()) {
1187 case SIInstrInfo::SCC_TRUE: {
1188 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1189 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1190 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1191 .addImm(0)
1192 .addReg(FalseReg)
1193 .addImm(0)
1194 .addReg(TrueReg)
1195 .addReg(SReg);
1196 break;
1197 }
1198 case SIInstrInfo::SCC_FALSE: {
1199 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1200 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1201 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1202 .addImm(0)
1203 .addReg(FalseReg)
1204 .addImm(0)
1205 .addReg(TrueReg)
1206 .addReg(SReg);
1207 break;
1208 }
1209 case SIInstrInfo::VCCNZ: {
1210 MachineOperand RegOp = Cond[1];
1211 RegOp.setImplicit(false);
1212 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1213 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1214 .add(RegOp);
1215 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1216 .addImm(0)
1217 .addReg(FalseReg)
1218 .addImm(0)
1219 .addReg(TrueReg)
1220 .addReg(SReg);
1221 break;
1222 }
1223 case SIInstrInfo::VCCZ: {
1224 MachineOperand RegOp = Cond[1];
1225 RegOp.setImplicit(false);
1226 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1227 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1228 .add(RegOp);
1229 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1230 .addImm(0)
1231 .addReg(TrueReg)
1232 .addImm(0)
1233 .addReg(FalseReg)
1234 .addReg(SReg);
1235 break;
1236 }
1237 case SIInstrInfo::EXECNZ: {
1238 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1239 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1240 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1241 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1242 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1243 .addImm(0)
1244 .addReg(FalseReg)
1245 .addImm(0)
1246 .addReg(TrueReg)
1247 .addReg(SReg);
1248 break;
1249 }
1250 case SIInstrInfo::EXECZ: {
1251 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1252 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1253 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1254 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1255 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1256 .addImm(0)
1257 .addReg(FalseReg)
1258 .addImm(0)
1259 .addReg(TrueReg)
1260 .addReg(SReg);
1261 llvm_unreachable("Unhandled branch predicate EXECZ");
1262 break;
1263 }
1264 default:
1265 llvm_unreachable("invalid branch predicate");
1266 }
1267 } else {
1268 llvm_unreachable("Can only handle Cond size 1 or 2");
1269 }
1270}
1271
1274 const DebugLoc &DL,
1275 Register SrcReg, int Value) const {
1276 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1277 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1278 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1279 .addImm(Value)
1280 .addReg(SrcReg);
1281
1282 return Reg;
1283}
1284
1287 const DebugLoc &DL,
1288 Register SrcReg, int Value) const {
1289 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1290 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1291 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1292 .addImm(Value)
1293 .addReg(SrcReg);
1294
1295 return Reg;
1296}
1297
1299 const Register Reg,
1300 int64_t &ImmVal) const {
1301 switch (MI.getOpcode()) {
1302 case AMDGPU::V_MOV_B32_e32:
1303 case AMDGPU::S_MOV_B32:
1304 case AMDGPU::S_MOVK_I32:
1305 case AMDGPU::S_MOV_B64:
1306 case AMDGPU::V_MOV_B64_e32:
1307 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1308 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1309 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1310 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1311 case AMDGPU::V_MOV_B64_PSEUDO:
1312 case AMDGPU::V_MOV_B16_t16_e32: {
1313 const MachineOperand &Src0 = MI.getOperand(1);
1314 if (Src0.isImm()) {
1315 ImmVal = Src0.getImm();
1316 return MI.getOperand(0).getReg() == Reg;
1317 }
1318
1319 return false;
1320 }
1321 case AMDGPU::V_MOV_B16_t16_e64: {
1322 const MachineOperand &Src0 = MI.getOperand(2);
1323 if (Src0.isImm() && !MI.getOperand(1).getImm()) {
1324 ImmVal = Src0.getImm();
1325 return MI.getOperand(0).getReg() == Reg;
1326 }
1327
1328 return false;
1329 }
1330 case AMDGPU::S_BREV_B32:
1331 case AMDGPU::V_BFREV_B32_e32:
1332 case AMDGPU::V_BFREV_B32_e64: {
1333 const MachineOperand &Src0 = MI.getOperand(1);
1334 if (Src0.isImm()) {
1335 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1336 return MI.getOperand(0).getReg() == Reg;
1337 }
1338
1339 return false;
1340 }
1341 case AMDGPU::S_NOT_B32:
1342 case AMDGPU::V_NOT_B32_e32:
1343 case AMDGPU::V_NOT_B32_e64: {
1344 const MachineOperand &Src0 = MI.getOperand(1);
1345 if (Src0.isImm()) {
1346 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1347 return MI.getOperand(0).getReg() == Reg;
1348 }
1349
1350 return false;
1351 }
1352 default:
1353 return false;
1354 }
1355}
1356
1357std::optional<int64_t>
1359 if (Op.isImm())
1360 return Op.getImm();
1361
1362 if (!Op.isReg() || !Op.getReg().isVirtual())
1363 return std::nullopt;
1364 MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
1365 const MachineInstr *Def = MRI.getVRegDef(Op.getReg());
1366 if (Def && Def->isMoveImmediate()) {
1367 const MachineOperand &ImmSrc = Def->getOperand(1);
1368 if (ImmSrc.isImm())
1369 return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
1370 }
1371
1372 return std::nullopt;
1373}
1374
1376
1377 if (RI.isAGPRClass(DstRC))
1378 return AMDGPU::COPY;
1379 if (RI.getRegSizeInBits(*DstRC) == 16) {
1380 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1381 // before RA.
1382 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1383 }
1384 if (RI.getRegSizeInBits(*DstRC) == 32)
1385 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1386 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1387 return AMDGPU::S_MOV_B64;
1388 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1389 return AMDGPU::V_MOV_B64_PSEUDO;
1390 return AMDGPU::COPY;
1391}
1392
1393const MCInstrDesc &
1395 bool IsIndirectSrc) const {
1396 if (IsIndirectSrc) {
1397 if (VecSize <= 32) // 4 bytes
1398 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1399 if (VecSize <= 64) // 8 bytes
1400 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1401 if (VecSize <= 96) // 12 bytes
1402 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1403 if (VecSize <= 128) // 16 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1405 if (VecSize <= 160) // 20 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1407 if (VecSize <= 192) // 24 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
1409 if (VecSize <= 224) // 28 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
1411 if (VecSize <= 256) // 32 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1413 if (VecSize <= 288) // 36 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1415 if (VecSize <= 320) // 40 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1417 if (VecSize <= 352) // 44 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1419 if (VecSize <= 384) // 48 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1421 if (VecSize <= 512) // 64 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1423 if (VecSize <= 1024) // 128 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1425
1426 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1427 }
1428
1429 if (VecSize <= 32) // 4 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1431 if (VecSize <= 64) // 8 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1433 if (VecSize <= 96) // 12 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1435 if (VecSize <= 128) // 16 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1437 if (VecSize <= 160) // 20 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1439 if (VecSize <= 192) // 24 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
1441 if (VecSize <= 224) // 28 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
1443 if (VecSize <= 256) // 32 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1445 if (VecSize <= 288) // 36 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1447 if (VecSize <= 320) // 40 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1449 if (VecSize <= 352) // 44 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1451 if (VecSize <= 384) // 48 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1453 if (VecSize <= 512) // 64 bytes
1454 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1455 if (VecSize <= 1024) // 128 bytes
1456 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1457
1458 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1459}
1460
1461static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1462 if (VecSize <= 32) // 4 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1464 if (VecSize <= 64) // 8 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1466 if (VecSize <= 96) // 12 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1468 if (VecSize <= 128) // 16 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1470 if (VecSize <= 160) // 20 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1472 if (VecSize <= 192) // 24 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1474 if (VecSize <= 224) // 28 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1476 if (VecSize <= 256) // 32 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1478 if (VecSize <= 288) // 36 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1480 if (VecSize <= 320) // 40 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1482 if (VecSize <= 352) // 44 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1484 if (VecSize <= 384) // 48 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1486 if (VecSize <= 512) // 64 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1488 if (VecSize <= 1024) // 128 bytes
1489 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1490
1491 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1492}
1493
1494static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1495 if (VecSize <= 32) // 4 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1497 if (VecSize <= 64) // 8 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1499 if (VecSize <= 96) // 12 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1501 if (VecSize <= 128) // 16 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1503 if (VecSize <= 160) // 20 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1505 if (VecSize <= 192) // 24 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1507 if (VecSize <= 224) // 28 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1509 if (VecSize <= 256) // 32 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1511 if (VecSize <= 288) // 36 bytes
1512 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1513 if (VecSize <= 320) // 40 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1515 if (VecSize <= 352) // 44 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1517 if (VecSize <= 384) // 48 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1519 if (VecSize <= 512) // 64 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1521 if (VecSize <= 1024) // 128 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1523
1524 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1525}
1526
1527static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1528 if (VecSize <= 64) // 8 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1530 if (VecSize <= 128) // 16 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1532 if (VecSize <= 256) // 32 bytes
1533 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1534 if (VecSize <= 512) // 64 bytes
1535 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1536 if (VecSize <= 1024) // 128 bytes
1537 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1538
1539 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1540}
1541
1542const MCInstrDesc &
1543SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1544 bool IsSGPR) const {
1545 if (IsSGPR) {
1546 switch (EltSize) {
1547 case 32:
1548 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1549 case 64:
1550 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1551 default:
1552 llvm_unreachable("invalid reg indexing elt size");
1553 }
1554 }
1555
1556 assert(EltSize == 32 && "invalid reg indexing elt size");
1558}
1559
1560static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1561 switch (Size) {
1562 case 4:
1563 return AMDGPU::SI_SPILL_S32_SAVE;
1564 case 8:
1565 return AMDGPU::SI_SPILL_S64_SAVE;
1566 case 12:
1567 return AMDGPU::SI_SPILL_S96_SAVE;
1568 case 16:
1569 return AMDGPU::SI_SPILL_S128_SAVE;
1570 case 20:
1571 return AMDGPU::SI_SPILL_S160_SAVE;
1572 case 24:
1573 return AMDGPU::SI_SPILL_S192_SAVE;
1574 case 28:
1575 return AMDGPU::SI_SPILL_S224_SAVE;
1576 case 32:
1577 return AMDGPU::SI_SPILL_S256_SAVE;
1578 case 36:
1579 return AMDGPU::SI_SPILL_S288_SAVE;
1580 case 40:
1581 return AMDGPU::SI_SPILL_S320_SAVE;
1582 case 44:
1583 return AMDGPU::SI_SPILL_S352_SAVE;
1584 case 48:
1585 return AMDGPU::SI_SPILL_S384_SAVE;
1586 case 64:
1587 return AMDGPU::SI_SPILL_S512_SAVE;
1588 case 128:
1589 return AMDGPU::SI_SPILL_S1024_SAVE;
1590 default:
1591 llvm_unreachable("unknown register size");
1592 }
1593}
1594
1595static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1596 switch (Size) {
1597 case 2:
1598 return AMDGPU::SI_SPILL_V16_SAVE;
1599 case 4:
1600 return AMDGPU::SI_SPILL_V32_SAVE;
1601 case 8:
1602 return AMDGPU::SI_SPILL_V64_SAVE;
1603 case 12:
1604 return AMDGPU::SI_SPILL_V96_SAVE;
1605 case 16:
1606 return AMDGPU::SI_SPILL_V128_SAVE;
1607 case 20:
1608 return AMDGPU::SI_SPILL_V160_SAVE;
1609 case 24:
1610 return AMDGPU::SI_SPILL_V192_SAVE;
1611 case 28:
1612 return AMDGPU::SI_SPILL_V224_SAVE;
1613 case 32:
1614 return AMDGPU::SI_SPILL_V256_SAVE;
1615 case 36:
1616 return AMDGPU::SI_SPILL_V288_SAVE;
1617 case 40:
1618 return AMDGPU::SI_SPILL_V320_SAVE;
1619 case 44:
1620 return AMDGPU::SI_SPILL_V352_SAVE;
1621 case 48:
1622 return AMDGPU::SI_SPILL_V384_SAVE;
1623 case 64:
1624 return AMDGPU::SI_SPILL_V512_SAVE;
1625 case 128:
1626 return AMDGPU::SI_SPILL_V1024_SAVE;
1627 default:
1628 llvm_unreachable("unknown register size");
1629 }
1630}
1631
1632static unsigned getAVSpillSaveOpcode(unsigned Size) {
1633 switch (Size) {
1634 case 4:
1635 return AMDGPU::SI_SPILL_AV32_SAVE;
1636 case 8:
1637 return AMDGPU::SI_SPILL_AV64_SAVE;
1638 case 12:
1639 return AMDGPU::SI_SPILL_AV96_SAVE;
1640 case 16:
1641 return AMDGPU::SI_SPILL_AV128_SAVE;
1642 case 20:
1643 return AMDGPU::SI_SPILL_AV160_SAVE;
1644 case 24:
1645 return AMDGPU::SI_SPILL_AV192_SAVE;
1646 case 28:
1647 return AMDGPU::SI_SPILL_AV224_SAVE;
1648 case 32:
1649 return AMDGPU::SI_SPILL_AV256_SAVE;
1650 case 36:
1651 return AMDGPU::SI_SPILL_AV288_SAVE;
1652 case 40:
1653 return AMDGPU::SI_SPILL_AV320_SAVE;
1654 case 44:
1655 return AMDGPU::SI_SPILL_AV352_SAVE;
1656 case 48:
1657 return AMDGPU::SI_SPILL_AV384_SAVE;
1658 case 64:
1659 return AMDGPU::SI_SPILL_AV512_SAVE;
1660 case 128:
1661 return AMDGPU::SI_SPILL_AV1024_SAVE;
1662 default:
1663 llvm_unreachable("unknown register size");
1664 }
1665}
1666
1667static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1668 bool IsVectorSuperClass) {
1669 // Currently, there is only 32-bit WWM register spills needed.
1670 if (Size != 4)
1671 llvm_unreachable("unknown wwm register spill size");
1672
1673 if (IsVectorSuperClass)
1674 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1675
1676 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1677}
1678
1680 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1681 const SIMachineFunctionInfo &MFI) const {
1682 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1683
1684 // Choose the right opcode if spilling a WWM register.
1686 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1687
1688 // TODO: Check if AGPRs are available
1689 if (ST.hasMAIInsts())
1690 return getAVSpillSaveOpcode(Size);
1691
1693}
1694
1697 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1698 MachineInstr::MIFlag Flags) const {
1699 MachineFunction *MF = MBB.getParent();
1701 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1702 const DebugLoc &DL = MBB.findDebugLoc(MI);
1703
1704 MachinePointerInfo PtrInfo
1705 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1707 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1708 FrameInfo.getObjectAlign(FrameIndex));
1709 unsigned SpillSize = RI.getSpillSize(*RC);
1710
1711 MachineRegisterInfo &MRI = MF->getRegInfo();
1712 if (RI.isSGPRClass(RC)) {
1713 MFI->setHasSpilledSGPRs();
1714 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1715 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1716 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1717
1718 // We are only allowed to create one new instruction when spilling
1719 // registers, so we need to use pseudo instruction for spilling SGPRs.
1720 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1721
1722 // The SGPR spill/restore instructions only work on number sgprs, so we need
1723 // to make sure we are using the correct register class.
1724 if (SrcReg.isVirtual() && SpillSize == 4) {
1725 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1726 }
1727
1728 BuildMI(MBB, MI, DL, OpDesc)
1729 .addReg(SrcReg, getKillRegState(isKill)) // data
1730 .addFrameIndex(FrameIndex) // addr
1731 .addMemOperand(MMO)
1733
1734 if (RI.spillSGPRToVGPR())
1735 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1736 return;
1737 }
1738
1739 unsigned Opcode =
1740 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1741 MFI->setHasSpilledVGPRs();
1742
1743 BuildMI(MBB, MI, DL, get(Opcode))
1744 .addReg(SrcReg, getKillRegState(isKill)) // data
1745 .addFrameIndex(FrameIndex) // addr
1746 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1747 .addImm(0) // offset
1748 .addMemOperand(MMO);
1749}
1750
1751static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1752 switch (Size) {
1753 case 4:
1754 return AMDGPU::SI_SPILL_S32_RESTORE;
1755 case 8:
1756 return AMDGPU::SI_SPILL_S64_RESTORE;
1757 case 12:
1758 return AMDGPU::SI_SPILL_S96_RESTORE;
1759 case 16:
1760 return AMDGPU::SI_SPILL_S128_RESTORE;
1761 case 20:
1762 return AMDGPU::SI_SPILL_S160_RESTORE;
1763 case 24:
1764 return AMDGPU::SI_SPILL_S192_RESTORE;
1765 case 28:
1766 return AMDGPU::SI_SPILL_S224_RESTORE;
1767 case 32:
1768 return AMDGPU::SI_SPILL_S256_RESTORE;
1769 case 36:
1770 return AMDGPU::SI_SPILL_S288_RESTORE;
1771 case 40:
1772 return AMDGPU::SI_SPILL_S320_RESTORE;
1773 case 44:
1774 return AMDGPU::SI_SPILL_S352_RESTORE;
1775 case 48:
1776 return AMDGPU::SI_SPILL_S384_RESTORE;
1777 case 64:
1778 return AMDGPU::SI_SPILL_S512_RESTORE;
1779 case 128:
1780 return AMDGPU::SI_SPILL_S1024_RESTORE;
1781 default:
1782 llvm_unreachable("unknown register size");
1783 }
1784}
1785
1786static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1787 switch (Size) {
1788 case 2:
1789 return AMDGPU::SI_SPILL_V16_RESTORE;
1790 case 4:
1791 return AMDGPU::SI_SPILL_V32_RESTORE;
1792 case 8:
1793 return AMDGPU::SI_SPILL_V64_RESTORE;
1794 case 12:
1795 return AMDGPU::SI_SPILL_V96_RESTORE;
1796 case 16:
1797 return AMDGPU::SI_SPILL_V128_RESTORE;
1798 case 20:
1799 return AMDGPU::SI_SPILL_V160_RESTORE;
1800 case 24:
1801 return AMDGPU::SI_SPILL_V192_RESTORE;
1802 case 28:
1803 return AMDGPU::SI_SPILL_V224_RESTORE;
1804 case 32:
1805 return AMDGPU::SI_SPILL_V256_RESTORE;
1806 case 36:
1807 return AMDGPU::SI_SPILL_V288_RESTORE;
1808 case 40:
1809 return AMDGPU::SI_SPILL_V320_RESTORE;
1810 case 44:
1811 return AMDGPU::SI_SPILL_V352_RESTORE;
1812 case 48:
1813 return AMDGPU::SI_SPILL_V384_RESTORE;
1814 case 64:
1815 return AMDGPU::SI_SPILL_V512_RESTORE;
1816 case 128:
1817 return AMDGPU::SI_SPILL_V1024_RESTORE;
1818 default:
1819 llvm_unreachable("unknown register size");
1820 }
1821}
1822
1823static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1824 switch (Size) {
1825 case 4:
1826 return AMDGPU::SI_SPILL_AV32_RESTORE;
1827 case 8:
1828 return AMDGPU::SI_SPILL_AV64_RESTORE;
1829 case 12:
1830 return AMDGPU::SI_SPILL_AV96_RESTORE;
1831 case 16:
1832 return AMDGPU::SI_SPILL_AV128_RESTORE;
1833 case 20:
1834 return AMDGPU::SI_SPILL_AV160_RESTORE;
1835 case 24:
1836 return AMDGPU::SI_SPILL_AV192_RESTORE;
1837 case 28:
1838 return AMDGPU::SI_SPILL_AV224_RESTORE;
1839 case 32:
1840 return AMDGPU::SI_SPILL_AV256_RESTORE;
1841 case 36:
1842 return AMDGPU::SI_SPILL_AV288_RESTORE;
1843 case 40:
1844 return AMDGPU::SI_SPILL_AV320_RESTORE;
1845 case 44:
1846 return AMDGPU::SI_SPILL_AV352_RESTORE;
1847 case 48:
1848 return AMDGPU::SI_SPILL_AV384_RESTORE;
1849 case 64:
1850 return AMDGPU::SI_SPILL_AV512_RESTORE;
1851 case 128:
1852 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1853 default:
1854 llvm_unreachable("unknown register size");
1855 }
1856}
1857
1858static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1859 bool IsVectorSuperClass) {
1860 // Currently, there is only 32-bit WWM register spills needed.
1861 if (Size != 4)
1862 llvm_unreachable("unknown wwm register spill size");
1863
1864 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1865 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1866
1867 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1868}
1869
1871 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1872 const SIMachineFunctionInfo &MFI) const {
1873 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1874
1875 // Choose the right opcode if restoring a WWM register.
1877 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1878
1879 // TODO: Check if AGPRs are available
1880 if (ST.hasMAIInsts())
1882
1883 assert(!RI.isAGPRClass(RC));
1885}
1886
1889 Register DestReg, int FrameIndex,
1890 const TargetRegisterClass *RC,
1891 Register VReg, unsigned SubReg,
1892 MachineInstr::MIFlag Flags) const {
1893 MachineFunction *MF = MBB.getParent();
1895 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1896 const DebugLoc &DL = MBB.findDebugLoc(MI);
1897 unsigned SpillSize = RI.getSpillSize(*RC);
1898
1899 MachinePointerInfo PtrInfo
1900 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1901
1903 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1904 FrameInfo.getObjectAlign(FrameIndex));
1905
1906 if (RI.isSGPRClass(RC)) {
1907 MFI->setHasSpilledSGPRs();
1908 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1909 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1910 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1911
1912 // FIXME: Maybe this should not include a memoperand because it will be
1913 // lowered to non-memory instructions.
1914 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1915 if (DestReg.isVirtual() && SpillSize == 4) {
1916 MachineRegisterInfo &MRI = MF->getRegInfo();
1917 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1918 }
1919
1920 if (RI.spillSGPRToVGPR())
1921 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1922 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1923 .addFrameIndex(FrameIndex) // addr
1924 .addMemOperand(MMO)
1926
1927 return;
1928 }
1929
1930 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1931 SpillSize, *MFI);
1932 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1933 .addFrameIndex(FrameIndex) // vaddr
1934 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1935 .addImm(0) // offset
1936 .addMemOperand(MMO);
1937}
1938
1943
1946 unsigned Quantity) const {
1947 DebugLoc DL = MBB.findDebugLoc(MI);
1948 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1949 while (Quantity > 0) {
1950 unsigned Arg = std::min(Quantity, MaxSNopCount);
1951 Quantity -= Arg;
1952 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1953 }
1954}
1955
1957 auto *MF = MBB.getParent();
1958 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1959
1960 assert(Info->isEntryFunction());
1961
1962 if (MBB.succ_empty()) {
1963 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1964 if (HasNoTerminator) {
1965 if (Info->returnsVoid()) {
1966 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1967 } else {
1968 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1969 }
1970 }
1971 }
1972}
1973
1977 const DebugLoc &DL) const {
1978 MachineFunction *MF = MBB.getParent();
1979 constexpr unsigned DoorbellIDMask = 0x3ff;
1980 constexpr unsigned ECQueueWaveAbort = 0x400;
1981
1982 MachineBasicBlock *TrapBB = &MBB;
1983 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1984
1985 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1986 MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1987 TrapBB = MF->CreateMachineBasicBlock();
1988 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1989 MF->push_back(TrapBB);
1990 MBB.addSuccessor(TrapBB);
1991 }
1992 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
1993 // this will be a nop.
1994 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1995 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1996 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1997 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
1998 DoorbellReg)
2000 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2001 .addUse(AMDGPU::M0);
2002 Register DoorbellRegMasked =
2003 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2004 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2005 .addUse(DoorbellReg)
2006 .addImm(DoorbellIDMask);
2007 Register SetWaveAbortBit =
2008 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2009 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2010 .addUse(DoorbellRegMasked)
2011 .addImm(ECQueueWaveAbort);
2012 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2013 .addUse(SetWaveAbortBit);
2014 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2016 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2017 .addUse(AMDGPU::TTMP2);
2018 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2019 TrapBB->addSuccessor(HaltLoopBB);
2020
2021 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2022 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2023 .addMBB(HaltLoopBB);
2024 MF->push_back(HaltLoopBB);
2025 HaltLoopBB->addSuccessor(HaltLoopBB);
2026
2027 return MBB.getNextNode();
2028}
2029
2031 switch (MI.getOpcode()) {
2032 default:
2033 if (MI.isMetaInstruction())
2034 return 0;
2035 return 1; // FIXME: Do wait states equal cycles?
2036
2037 case AMDGPU::S_NOP:
2038 return MI.getOperand(0).getImm() + 1;
2039 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2040 // hazard, even if one exists, won't really be visible. Should we handle it?
2041 }
2042}
2043
2045 MachineBasicBlock &MBB = *MI.getParent();
2046 DebugLoc DL = MBB.findDebugLoc(MI);
2048 switch (MI.getOpcode()) {
2049 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2050 case AMDGPU::S_MOV_B64_term:
2051 // This is only a terminator to get the correct spill code placement during
2052 // register allocation.
2053 MI.setDesc(get(AMDGPU::S_MOV_B64));
2054 break;
2055
2056 case AMDGPU::S_MOV_B32_term:
2057 // This is only a terminator to get the correct spill code placement during
2058 // register allocation.
2059 MI.setDesc(get(AMDGPU::S_MOV_B32));
2060 break;
2061
2062 case AMDGPU::S_XOR_B64_term:
2063 // This is only a terminator to get the correct spill code placement during
2064 // register allocation.
2065 MI.setDesc(get(AMDGPU::S_XOR_B64));
2066 break;
2067
2068 case AMDGPU::S_XOR_B32_term:
2069 // This is only a terminator to get the correct spill code placement during
2070 // register allocation.
2071 MI.setDesc(get(AMDGPU::S_XOR_B32));
2072 break;
2073 case AMDGPU::S_OR_B64_term:
2074 // This is only a terminator to get the correct spill code placement during
2075 // register allocation.
2076 MI.setDesc(get(AMDGPU::S_OR_B64));
2077 break;
2078 case AMDGPU::S_OR_B32_term:
2079 // This is only a terminator to get the correct spill code placement during
2080 // register allocation.
2081 MI.setDesc(get(AMDGPU::S_OR_B32));
2082 break;
2083
2084 case AMDGPU::S_ANDN2_B64_term:
2085 // This is only a terminator to get the correct spill code placement during
2086 // register allocation.
2087 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2088 break;
2089
2090 case AMDGPU::S_ANDN2_B32_term:
2091 // This is only a terminator to get the correct spill code placement during
2092 // register allocation.
2093 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2094 break;
2095
2096 case AMDGPU::S_AND_B64_term:
2097 // This is only a terminator to get the correct spill code placement during
2098 // register allocation.
2099 MI.setDesc(get(AMDGPU::S_AND_B64));
2100 break;
2101
2102 case AMDGPU::S_AND_B32_term:
2103 // This is only a terminator to get the correct spill code placement during
2104 // register allocation.
2105 MI.setDesc(get(AMDGPU::S_AND_B32));
2106 break;
2107
2108 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2109 // This is only a terminator to get the correct spill code placement during
2110 // register allocation.
2111 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2112 break;
2113
2114 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2115 // This is only a terminator to get the correct spill code placement during
2116 // register allocation.
2117 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2118 break;
2119
2120 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2121 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2122 break;
2123
2124 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2125 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2126 break;
2127 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2128 Register Dst = MI.getOperand(0).getReg();
2129 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2130 MI.setDesc(
2131 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2132 break;
2133 }
2134 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2135 Register Dst = MI.getOperand(0).getReg();
2136 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2137 int64_t Imm = MI.getOperand(1).getImm();
2138
2139 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2140 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2141 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2142 .addImm(SignExtend64<32>(Imm));
2143 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2144 .addImm(SignExtend64<32>(Imm >> 32));
2145 MI.eraseFromParent();
2146 break;
2147 }
2148
2149 [[fallthrough]];
2150 }
2151 case AMDGPU::V_MOV_B64_PSEUDO: {
2152 Register Dst = MI.getOperand(0).getReg();
2153 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2154 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2155
2156 const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
2157 const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0);
2158
2159 const MachineOperand &SrcOp = MI.getOperand(1);
2160 // FIXME: Will this work for 64-bit floating point immediates?
2161 assert(!SrcOp.isFPImm());
2162 if (ST.hasVMovB64Inst() && Mov64RC->contains(Dst)) {
2163 MI.setDesc(Mov64Desc);
2164 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2165 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2166 break;
2167 }
2168 if (SrcOp.isImm()) {
2169 APInt Imm(64, SrcOp.getImm());
2170 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2171 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2172 const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
2173 const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0);
2174
2175 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
2176 PkMovRC->contains(Dst)) {
2177 BuildMI(MBB, MI, DL, PkMovDesc, Dst)
2179 .addImm(Lo.getSExtValue())
2181 .addImm(Lo.getSExtValue())
2182 .addImm(0) // op_sel_lo
2183 .addImm(0) // op_sel_hi
2184 .addImm(0) // neg_lo
2185 .addImm(0) // neg_hi
2186 .addImm(0); // clamp
2187 } else {
2188 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2189 .addImm(Lo.getSExtValue());
2190 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2191 .addImm(Hi.getSExtValue());
2192 }
2193 } else {
2194 assert(SrcOp.isReg());
2195 if (ST.hasPkMovB32() &&
2196 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2197 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2198 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2199 .addReg(SrcOp.getReg())
2201 .addReg(SrcOp.getReg())
2202 .addImm(0) // op_sel_lo
2203 .addImm(0) // op_sel_hi
2204 .addImm(0) // neg_lo
2205 .addImm(0) // neg_hi
2206 .addImm(0); // clamp
2207 } else {
2208 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2209 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0));
2210 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2211 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1));
2212 }
2213 }
2214 MI.eraseFromParent();
2215 break;
2216 }
2217 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2219 break;
2220 }
2221 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2222 const MachineOperand &SrcOp = MI.getOperand(1);
2223 assert(!SrcOp.isFPImm());
2224
2225 if (ST.has64BitLiterals()) {
2226 MI.setDesc(get(AMDGPU::S_MOV_B64));
2227 break;
2228 }
2229
2230 APInt Imm(64, SrcOp.getImm());
2231 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2232 MI.setDesc(get(AMDGPU::S_MOV_B64));
2233 break;
2234 }
2235
2236 Register Dst = MI.getOperand(0).getReg();
2237 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2238 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2239
2240 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2241 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2242 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2243 .addImm(Lo.getSExtValue());
2244 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2245 .addImm(Hi.getSExtValue());
2246 MI.eraseFromParent();
2247 break;
2248 }
2249 case AMDGPU::V_SET_INACTIVE_B32: {
2250 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2251 Register DstReg = MI.getOperand(0).getReg();
2252 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2253 .add(MI.getOperand(3))
2254 .add(MI.getOperand(4))
2255 .add(MI.getOperand(1))
2256 .add(MI.getOperand(2))
2257 .add(MI.getOperand(5));
2258 MI.eraseFromParent();
2259 break;
2260 }
2261 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2262 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2263 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2264 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2265 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2266 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2267 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2268 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2269 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2270 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2271 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2272 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2273 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2274 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2275 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2276 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2277 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2278 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2279 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2280 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2281 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2282 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2283 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2284 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2285 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2286 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2287 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2288 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2289 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2290 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2291 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2292 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2293 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2294 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2295
2296 unsigned Opc;
2297 if (RI.hasVGPRs(EltRC)) {
2298 Opc = AMDGPU::V_MOVRELD_B32_e32;
2299 } else {
2300 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2301 : AMDGPU::S_MOVRELD_B32;
2302 }
2303
2304 const MCInstrDesc &OpDesc = get(Opc);
2305 Register VecReg = MI.getOperand(0).getReg();
2306 bool IsUndef = MI.getOperand(1).isUndef();
2307 unsigned SubReg = MI.getOperand(3).getImm();
2308 assert(VecReg == MI.getOperand(1).getReg());
2309
2311 BuildMI(MBB, MI, DL, OpDesc)
2312 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2313 .add(MI.getOperand(2))
2315 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2316
2317 const int ImpDefIdx =
2318 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2319 const int ImpUseIdx = ImpDefIdx + 1;
2320 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2321 MI.eraseFromParent();
2322 break;
2323 }
2324 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2325 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2326 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2327 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2328 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2329 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2330 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2331 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2332 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2333 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2334 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2335 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2336 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2337 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2338 assert(ST.useVGPRIndexMode());
2339 Register VecReg = MI.getOperand(0).getReg();
2340 bool IsUndef = MI.getOperand(1).isUndef();
2341 MachineOperand &Idx = MI.getOperand(3);
2342 Register SubReg = MI.getOperand(4).getImm();
2343
2344 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2345 .add(Idx)
2347 SetOn->getOperand(3).setIsUndef();
2348
2349 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2351 BuildMI(MBB, MI, DL, OpDesc)
2352 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2353 .add(MI.getOperand(2))
2355 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2356
2357 const int ImpDefIdx =
2358 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2359 const int ImpUseIdx = ImpDefIdx + 1;
2360 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2361
2362 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2363
2364 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2365
2366 MI.eraseFromParent();
2367 break;
2368 }
2369 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2370 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2371 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2372 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2373 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2374 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2375 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2376 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2377 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2378 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2379 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2380 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2381 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2382 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2383 assert(ST.useVGPRIndexMode());
2384 Register Dst = MI.getOperand(0).getReg();
2385 Register VecReg = MI.getOperand(1).getReg();
2386 bool IsUndef = MI.getOperand(1).isUndef();
2387 Register SubReg = MI.getOperand(3).getImm();
2388
2389 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2390 .add(MI.getOperand(2))
2392 SetOn->getOperand(3).setIsUndef();
2393
2394 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2395 .addDef(Dst)
2396 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2397 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2398
2399 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2400
2401 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2402
2403 MI.eraseFromParent();
2404 break;
2405 }
2406 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2407 MachineFunction &MF = *MBB.getParent();
2408 Register Reg = MI.getOperand(0).getReg();
2409 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2410 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2411 MachineOperand OpLo = MI.getOperand(1);
2412 MachineOperand OpHi = MI.getOperand(2);
2413
2414 // Create a bundle so these instructions won't be re-ordered by the
2415 // post-RA scheduler.
2416 MIBundleBuilder Bundler(MBB, MI);
2417 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2418
2419 // What we want here is an offset from the value returned by s_getpc (which
2420 // is the address of the s_add_u32 instruction) to the global variable, but
2421 // since the encoding of $symbol starts 4 bytes after the start of the
2422 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2423 // small. This requires us to add 4 to the global variable offset in order
2424 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2425 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2426 // instruction.
2427
2428 int64_t Adjust = 0;
2429 if (ST.hasGetPCZeroExtension()) {
2430 // Fix up hardware that does not sign-extend the 48-bit PC value by
2431 // inserting: s_sext_i32_i16 reghi, reghi
2432 Bundler.append(
2433 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2434 Adjust += 4;
2435 }
2436
2437 if (OpLo.isGlobal())
2438 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2439 Bundler.append(
2440 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2441
2442 if (OpHi.isGlobal())
2443 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2444 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2445 .addReg(RegHi)
2446 .add(OpHi));
2447
2448 finalizeBundle(MBB, Bundler.begin());
2449
2450 MI.eraseFromParent();
2451 break;
2452 }
2453 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2454 MachineFunction &MF = *MBB.getParent();
2455 Register Reg = MI.getOperand(0).getReg();
2456 MachineOperand Op = MI.getOperand(1);
2457
2458 // Create a bundle so these instructions won't be re-ordered by the
2459 // post-RA scheduler.
2460 MIBundleBuilder Bundler(MBB, MI);
2461 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2462 if (Op.isGlobal())
2463 Op.setOffset(Op.getOffset() + 4);
2464 Bundler.append(
2465 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2466
2467 finalizeBundle(MBB, Bundler.begin());
2468
2469 MI.eraseFromParent();
2470 break;
2471 }
2472 case AMDGPU::ENTER_STRICT_WWM: {
2473 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2474 // Whole Wave Mode is entered.
2475 MI.setDesc(get(LMC.OrSaveExecOpc));
2476 break;
2477 }
2478 case AMDGPU::ENTER_STRICT_WQM: {
2479 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2480 // STRICT_WQM is entered.
2481 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2482 .addReg(LMC.ExecReg);
2483 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2484
2485 MI.eraseFromParent();
2486 break;
2487 }
2488 case AMDGPU::EXIT_STRICT_WWM:
2489 case AMDGPU::EXIT_STRICT_WQM: {
2490 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2491 // WWM/STRICT_WQM is exited.
2492 MI.setDesc(get(LMC.MovOpc));
2493 break;
2494 }
2495 case AMDGPU::SI_RETURN: {
2496 const MachineFunction *MF = MBB.getParent();
2497 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2498 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2499 // Hiding the return address use with SI_RETURN may lead to extra kills in
2500 // the function and missing live-ins. We are fine in practice because callee
2501 // saved register handling ensures the register value is restored before
2502 // RET, but we need the undef flag here to appease the MachineVerifier
2503 // liveness checks.
2505 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2506 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2507
2508 MIB.copyImplicitOps(MI);
2509 MI.eraseFromParent();
2510 break;
2511 }
2512
2513 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2514 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2515 MI.setDesc(get(AMDGPU::S_MUL_U64));
2516 break;
2517
2518 case AMDGPU::S_GETPC_B64_pseudo:
2519 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2520 if (ST.hasGetPCZeroExtension()) {
2521 Register Dst = MI.getOperand(0).getReg();
2522 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2523 // Fix up hardware that does not sign-extend the 48-bit PC value by
2524 // inserting: s_sext_i32_i16 dsthi, dsthi
2525 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2526 DstHi)
2527 .addReg(DstHi);
2528 }
2529 break;
2530
2531 case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
2532 assert(ST.hasBF16PackedInsts());
2533 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2534 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2535 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2536 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2537 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2538 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2539 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2540 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2541 break;
2542 }
2543
2544 case AMDGPU::GET_STACK_BASE:
2545 // The stack starts at offset 0 unless we need to reserve some space at the
2546 // bottom.
2547 if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) {
2548 // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
2549 // some of the VGPRs. The size of the required scratch space has already
2550 // been computed by prolog epilog insertion.
2551 const SIMachineFunctionInfo *MFI =
2552 MBB.getParent()->getInfo<SIMachineFunctionInfo>();
2553 unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
2554 Register DestReg = MI.getOperand(0).getReg();
2555 BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg)
2558 // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
2559 // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
2560 // SCC, so we need to check for 0 manually.
2561 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg);
2562 // Change the implicit-def of SCC to an explicit use (but first remove
2563 // the dead flag if present).
2564 MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false);
2565 MI.getOperand(MI.getNumExplicitOperands()).setIsUse();
2566 MI.setDesc(get(AMDGPU::S_CMOVK_I32));
2567 MI.addOperand(MachineOperand::CreateImm(VGPRSize));
2568 } else {
2569 MI.setDesc(get(AMDGPU::S_MOV_B32));
2570 MI.addOperand(MachineOperand::CreateImm(0));
2571 MI.removeOperand(
2572 MI.getNumExplicitOperands()); // Drop implicit def of SCC.
2573 }
2574 break;
2575 }
2576
2577 return true;
2578}
2579
2582 unsigned SubIdx, const MachineInstr &Orig,
2583 LaneBitmask UsedLanes) const {
2584
2585 // Try shrinking the instruction to remat only the part needed for current
2586 // context.
2587 // TODO: Handle more cases.
2588 unsigned Opcode = Orig.getOpcode();
2589 switch (Opcode) {
2590 case AMDGPU::S_MOV_B64:
2591 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2592 if (SubIdx != 0)
2593 break;
2594
2595 if (!Orig.getOperand(1).isImm())
2596 break;
2597
2598 // Shrink S_MOV_B64 to S_MOV_B32 when UsedLanes indicates only a single
2599 // 32-bit lane of the 64-bit value is live at the rematerialization point.
2600 if (UsedLanes.all())
2601 break;
2602
2603 // Determine which half of the 64-bit immediate corresponds to the use.
2604 unsigned OrigSubReg = Orig.getOperand(0).getSubReg();
2605 unsigned LoSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub0);
2606 unsigned HiSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub1);
2607
2608 bool NeedLo = (UsedLanes & RI.getSubRegIndexLaneMask(LoSubReg)).any();
2609 bool NeedHi = (UsedLanes & RI.getSubRegIndexLaneMask(HiSubReg)).any();
2610
2611 if (NeedLo && NeedHi)
2612 break;
2613
2614 int64_t Imm64 = Orig.getOperand(1).getImm();
2615 int32_t Imm32 = NeedLo ? Lo_32(Imm64) : Hi_32(Imm64);
2616
2617 unsigned UseSubReg = NeedLo ? LoSubReg : HiSubReg;
2618
2619 // Emit S_MOV_B32 defining just the needed 32-bit subreg of DestReg.
2620 BuildMI(MBB, I, Orig.getDebugLoc(), get(AMDGPU::S_MOV_B32))
2621 .addReg(DestReg, RegState::Define | RegState::Undef, UseSubReg)
2622 .addImm(Imm32);
2623 return;
2624 }
2625
2626 case AMDGPU::S_LOAD_DWORDX16_IMM:
2627 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2628 if (SubIdx != 0)
2629 break;
2630
2631 if (I == MBB.end())
2632 break;
2633
2634 if (I->isBundled())
2635 break;
2636
2637 // Look for a single use of the register that is also a subreg.
2638 Register RegToFind = Orig.getOperand(0).getReg();
2639 MachineOperand *UseMO = nullptr;
2640 for (auto &CandMO : I->operands()) {
2641 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2642 continue;
2643 if (UseMO) {
2644 UseMO = nullptr;
2645 break;
2646 }
2647 UseMO = &CandMO;
2648 }
2649 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2650 break;
2651
2652 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2653 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2654
2655 MachineFunction *MF = MBB.getParent();
2656 MachineRegisterInfo &MRI = MF->getRegInfo();
2657 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2658
2659 unsigned NewOpcode = -1;
2660 if (SubregSize == 256)
2661 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2662 else if (SubregSize == 128)
2663 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2664 else
2665 break;
2666
2667 const MCInstrDesc &TID = get(NewOpcode);
2668 const TargetRegisterClass *NewRC =
2669 RI.getAllocatableClass(getRegClass(TID, 0));
2670 MRI.setRegClass(DestReg, NewRC);
2671
2672 UseMO->setReg(DestReg);
2673 UseMO->setSubReg(AMDGPU::NoSubRegister);
2674
2675 // Use a smaller load with the desired size, possibly with updated offset.
2676 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2677 MI->setDesc(TID);
2678 MI->getOperand(0).setReg(DestReg);
2679 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2680 if (Offset) {
2681 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2682 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2683 OffsetMO->setImm(FinalOffset);
2684 }
2686 for (const MachineMemOperand *MemOp : Orig.memoperands())
2687 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2688 SubregSize / 8));
2689 MI->setMemRefs(*MF, NewMMOs);
2690
2691 MBB.insert(I, MI);
2692 return;
2693 }
2694
2695 default:
2696 break;
2697 }
2698
2699 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, UsedLanes);
2700}
2701
2702std::pair<MachineInstr*, MachineInstr*>
2704 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2705
2706 if (ST.hasVMovB64Inst() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2708 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2709 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2710 return std::pair(&MI, nullptr);
2711 }
2712
2713 MachineBasicBlock &MBB = *MI.getParent();
2714 DebugLoc DL = MBB.findDebugLoc(MI);
2715 MachineFunction *MF = MBB.getParent();
2716 MachineRegisterInfo &MRI = MF->getRegInfo();
2717 Register Dst = MI.getOperand(0).getReg();
2718 unsigned Part = 0;
2719 MachineInstr *Split[2];
2720
2721 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2722 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2723 if (Dst.isPhysical()) {
2724 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2725 } else {
2726 assert(MRI.isSSA());
2727 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2728 MovDPP.addDef(Tmp);
2729 }
2730
2731 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2732 const MachineOperand &SrcOp = MI.getOperand(I);
2733 assert(!SrcOp.isFPImm());
2734 if (SrcOp.isImm()) {
2735 APInt Imm(64, SrcOp.getImm());
2736 Imm.ashrInPlace(Part * 32);
2737 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2738 } else {
2739 assert(SrcOp.isReg());
2740 Register Src = SrcOp.getReg();
2741 if (Src.isPhysical())
2742 MovDPP.addReg(RI.getSubReg(Src, Sub));
2743 else
2744 MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub);
2745 }
2746 }
2747
2748 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2749 MovDPP.addImm(MO.getImm());
2750
2751 Split[Part] = MovDPP;
2752 ++Part;
2753 }
2754
2755 if (Dst.isVirtual())
2756 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2757 .addReg(Split[0]->getOperand(0).getReg())
2758 .addImm(AMDGPU::sub0)
2759 .addReg(Split[1]->getOperand(0).getReg())
2760 .addImm(AMDGPU::sub1);
2761
2762 MI.eraseFromParent();
2763 return std::pair(Split[0], Split[1]);
2764}
2765
2766std::optional<DestSourcePair>
2768 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2769 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2770
2771 return std::nullopt;
2772}
2773
2775 AMDGPU::OpName Src0OpName,
2776 MachineOperand &Src1,
2777 AMDGPU::OpName Src1OpName) const {
2778 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2779 if (!Src0Mods)
2780 return false;
2781
2782 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2783 assert(Src1Mods &&
2784 "All commutable instructions have both src0 and src1 modifiers");
2785
2786 int Src0ModsVal = Src0Mods->getImm();
2787 int Src1ModsVal = Src1Mods->getImm();
2788
2789 Src1Mods->setImm(Src0ModsVal);
2790 Src0Mods->setImm(Src1ModsVal);
2791 return true;
2792}
2793
2795 MachineOperand &RegOp,
2796 MachineOperand &NonRegOp) {
2797 Register Reg = RegOp.getReg();
2798 unsigned SubReg = RegOp.getSubReg();
2799 bool IsKill = RegOp.isKill();
2800 bool IsDead = RegOp.isDead();
2801 bool IsUndef = RegOp.isUndef();
2802 bool IsDebug = RegOp.isDebug();
2803
2804 if (NonRegOp.isImm())
2805 RegOp.ChangeToImmediate(NonRegOp.getImm());
2806 else if (NonRegOp.isFI())
2807 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2808 else if (NonRegOp.isGlobal()) {
2809 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2810 NonRegOp.getTargetFlags());
2811 } else
2812 return nullptr;
2813
2814 // Make sure we don't reinterpret a subreg index in the target flags.
2815 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2816
2817 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2818 NonRegOp.setSubReg(SubReg);
2819
2820 return &MI;
2821}
2822
2824 MachineOperand &NonRegOp1,
2825 MachineOperand &NonRegOp2) {
2826 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2827 int64_t NonRegVal = NonRegOp1.getImm();
2828
2829 NonRegOp1.setImm(NonRegOp2.getImm());
2830 NonRegOp2.setImm(NonRegVal);
2831 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2832 NonRegOp2.setTargetFlags(TargetFlags);
2833 return &MI;
2834}
2835
2836bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2837 unsigned OpIdx1) const {
2838 const MCInstrDesc &InstDesc = MI.getDesc();
2839 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2840 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2841
2842 unsigned Opc = MI.getOpcode();
2843 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2844
2845 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2846 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2847
2848 // Swap doesn't breach constant bus or literal limits
2849 // It may move literal to position other than src0, this is not allowed
2850 // pre-gfx10 However, most test cases need literals in Src0 for VOP
2851 // FIXME: After gfx9, literal can be in place other than Src0
2852 if (isVALU(MI)) {
2853 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2854 !isInlineConstant(MO0, OpInfo1))
2855 return false;
2856 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2857 !isInlineConstant(MO1, OpInfo0))
2858 return false;
2859 }
2860
2861 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2862 if (OpInfo1.RegClass == -1)
2863 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2864 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2865 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2866 }
2867 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2868 if (OpInfo0.RegClass == -1)
2869 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2870 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2871 isLegalRegOperand(MI, OpIdx0, MO1);
2872 }
2873
2874 // No need to check 64-bit literals since swapping does not bring new
2875 // 64-bit literals into current instruction to fold to 32-bit
2876
2877 return isImmOperandLegal(MI, OpIdx1, MO0);
2878}
2879
// Commutes the src0/src1 operands of MI in place and switches MI to the
// commuted opcode. Returns the modified instruction, or nullptr when there is
// no commuted opcode, the swap is not legal, or the operand kinds cannot be
// exchanged. NewMI must be false: creating a new instruction is unsupported.
// NOTE(review): the opening line of this signature is absent from this
// listing.
2881 unsigned Src0Idx,
2882 unsigned Src1Idx) const {
2883 assert(!NewMI && "this should never be used");
2884
2885 unsigned Opc = MI.getOpcode();
2886 int CommutedOpcode = commuteOpcode(Opc);
2887 if (CommutedOpcode == -1)
2888 return nullptr;
2889
// Normalize so that Src0Idx is the lower index.
2890 if (Src0Idx > Src1Idx)
2891 std::swap(Src0Idx, Src1Idx);
2892
2893 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2894 static_cast<int>(Src0Idx) &&
2895 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2896 static_cast<int>(Src1Idx) &&
2897 "inconsistency with findCommutedOpIndices");
2898
2899 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2900 return nullptr;
2901
// Pick the swap helper that matches the kinds of the two operands.
2902 MachineInstr *CommutedMI = nullptr;
2903 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2904 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2905 if (Src0.isReg() && Src1.isReg()) {
2906 // Be sure to copy the source modifiers to the right place.
2907 CommutedMI =
2908 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2909 } else if (Src0.isReg() && !Src1.isReg()) {
2910 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2911 } else if (!Src0.isReg() && Src1.isReg()) {
2912 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2913 } else if (Src0.isImm() && Src1.isImm()) {
2914 CommutedMI = swapImmOperands(MI, Src0, Src1);
2915 } else {
2916 // FIXME: Found two non registers to commute. This does happen.
2917 return nullptr;
2918 }
2919
// Once the operands are swapped, swap the per-source modifier and sel
// operands as well, then install the commuted opcode.
2920 if (CommutedMI) {
2921 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2922 Src1, AMDGPU::OpName::src1_modifiers);
2923
2924 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2925 AMDGPU::OpName::src1_sel);
2926
2927 CommutedMI->setDesc(get(CommutedOpcode));
2928 }
2929
2930 return CommutedMI;
2931}
2932
2933// This needs to be implemented because the source modifiers may be inserted
2934// between the true commutable operands, and the base
2935// TargetInstrInfo::commuteInstruction uses it.
// MachineInstr convenience form: forwards to the overload that takes the
// instruction's descriptor and returns its result unchanged.
// NOTE(review): the opening line of this signature is absent from this
// listing.
2937 unsigned &SrcOpIdx0,
2938 unsigned &SrcOpIdx1) const {
2939 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2940}
2941
// Descriptor form: returns false unless Desc is commutable and its opcode has
// both a named src0 and a named src1 operand. Otherwise the final answer, and
// any update of SrcOpIdx0/SrcOpIdx1, is delegated to fixCommutedOpIndices
// with those two operand indices.
// NOTE(review): the opening line of this signature is absent from this
// listing.
2943 unsigned &SrcOpIdx0,
2944 unsigned &SrcOpIdx1) const {
2945 if (!Desc.isCommutable())
2946 return false;
2947
2948 unsigned Opc = Desc.getOpcode();
2949 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2950 if (Src0Idx == -1)
2951 return false;
2952
2953 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2954 if (Src1Idx == -1)
2955 return false;
2956
2957 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2958}
2959
// Returns true if BrOffset (in bytes) can be encoded by the branch opcode
// BranchOp. The byte offset is converted to dwords, rebased to the
// instruction following the branch, and checked against a signed field of
// BranchOffsetBits bits.
// NOTE(review): the opening line of this signature is absent from this
// listing.
2961 int64_t BrOffset) const {
2962 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2963 // because its dest block is unanalyzable.
2964 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2965
2966 // Convert to dwords.
2967 BrOffset /= 4;
2968
2969 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2970 // from the next instruction.
2971 BrOffset -= 1;
2972
2973 return isIntN(BranchOffsetBits, BrOffset);
2974}
2975
// Returns the basic block held in operand 0 of MI.
// NOTE(review): the signature lines are absent from this listing; presumably
// MI is always a branch whose operand 0 is its target block - confirm.
2978 return MI.getOperand(0).getMBB();
2979}
2980
// Returns true if any terminator of MBB is one of the SI_IF, SI_ELSE or
// SI_LOOP pseudo instructions, and false otherwise.
// NOTE(review): the signature line is absent from this listing.
2982 for (const MachineInstr &MI : MBB->terminators()) {
2983 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2984 MI.getOpcode() == AMDGPU::SI_LOOP)
2985 return true;
2986 }
2987 return false;
2988}
2989
// Emits, into the empty block MBB, a PC-relative long branch to DestBB.
//
// Two code shapes are produced:
//  * If the subtarget reports useAddPC64Inst(), a single S_ADD_PC_I64 whose
//    offset symbol is defined as DestBB minus the label placed right after
//    the instruction.
//  * Otherwise S_GETPC_B64 / S_ADD_U32 / S_ADDC_U32 / S_SETPC_B64 operating
//    on a 64-bit SGPR pair. The pair starts as a virtual register and is then
//    replaced by the reserved long-branch register, a scavenged register, or
//    SGPR0_SGPR1 with an emergency spill. In the spill case the branch
//    targets RestoreBB (where the restore is placed) instead of DestBB.
//
// Preconditions (asserted): MBB is empty and has exactly one predecessor,
// RestoreBB is empty, and RS is non-null on the S_GETPC path.
// NOTE(review): the opening line of this signature and several interior
// lines (the definitions of MFI and of the later Offset expression, plus the
// operands of two BuildMI calls) are absent from this listing.
2991 MachineBasicBlock &DestBB,
2992 MachineBasicBlock &RestoreBB,
2993 const DebugLoc &DL, int64_t BrOffset,
2994 RegScavenger *RS) const {
2995 assert(MBB.empty() &&
2996 "new block should be inserted for expanding unconditional branch");
2997 assert(MBB.pred_size() == 1);
2998 assert(RestoreBB.empty() &&
2999 "restore block should be inserted for restoring clobbered registers");
3000
3001 MachineFunction *MF = MBB.getParent();
3002 MachineRegisterInfo &MRI = MF->getRegInfo();
3004 auto I = MBB.end();
3005 auto &MCCtx = MF->getContext();
3006
// Single-instruction form: the offset symbol is resolved as
// DestBB - post_addpc.
3007 if (ST.useAddPC64Inst()) {
3008 MCSymbol *Offset =
3009 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
3010 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
3012 MCSymbol *PostAddPCLabel =
3013 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
3014 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
3015 auto *OffsetExpr = MCBinaryExpr::createSub(
3016 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
3017 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
3018 Offset->setVariableValue(OffsetExpr);
3019 return;
3020 }
3021
3022 assert(RS && "RegScavenger required for long branching");
3023
3024 // FIXME: Virtual register workaround for RegScavenger not working with empty
3025 // blocks.
3026 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3027
3028 // Note: as this is used after hazard recognizer we need to apply some hazard
3029 // workarounds directly.
3030 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
3031 ST.hasVALUReadSGPRHazard();
3032 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
3033 if (FlushSGPRWrites)
3034 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
3036 };
3037
3038 // We need to compute the offset relative to the instruction immediately after
3039 // s_getpc_b64. Insert pc arithmetic code before last terminator.
3040 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
3041 ApplyHazardWorkarounds();
3042
3043 MCSymbol *PostGetPCLabel =
3044 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
3045 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
3046
// The 64-bit offset is added as two 32-bit halves (add, then add-with-carry),
// each referring to its own temporary symbol that is defined at the end of
// this function.
3047 MCSymbol *OffsetLo =
3048 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
3049 MCSymbol *OffsetHi =
3050 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
3051 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
3052 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
3053 .addReg(PCReg, {}, AMDGPU::sub0)
3054 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
3055 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
3056 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
3057 .addReg(PCReg, {}, AMDGPU::sub1)
3058 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
3059 ApplyHazardWorkarounds();
3060
3061 // Insert the indirect branch after the other terminator.
3062 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
3063 .addReg(PCReg);
3064
3065 // If a spill is needed for the pc register pair, we need to insert a spill
3066 // restore block right before the destination block, and insert a short branch
3067 // into the old destination block's fallthrough predecessor.
3068 // e.g.:
3069 //
3070 // s_cbranch_scc0 skip_long_branch:
3071 //
3072 // long_branch_bb:
3073 // spill s[8:9]
3074 // s_getpc_b64 s[8:9]
3075 // s_add_u32 s8, s8, restore_bb
3076 // s_addc_u32 s9, s9, 0
3077 // s_setpc_b64 s[8:9]
3078 //
3079 // skip_long_branch:
3080 // foo;
3081 //
3082 // .....
3083 //
3084 // dest_bb_fallthrough_predecessor:
3085 // bar;
3086 // s_branch dest_bb
3087 //
3088 // restore_bb:
3089 // restore s[8:9]
3090 // fallthrough dest_bb
3091 ///
3092 // dest_bb:
3093 // buzz;
3094
3095 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3096 Register Scav;
3097
3098 // If we've previously reserved a register for long branches
3099 // avoid running the scavenger and just use those registers
3100 if (LongBranchReservedReg) {
3101 RS->enterBasicBlock(MBB);
3102 Scav = LongBranchReservedReg;
3103 } else {
3104 RS->enterBasicBlockEnd(MBB);
3105 Scav = RS->scavengeRegisterBackwards(
3106 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3107 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3108 }
// Scav is only null when the scavenger found nothing (spilling is disallowed
// above); in that case fall back to SGPR0_SGPR1 with an emergency spill.
3109 if (Scav) {
3110 RS->setRegUsed(Scav);
3111 MRI.replaceRegWith(PCReg, Scav);
3112 MRI.clearVirtRegs();
3113 } else {
3114 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3115 // SGPR spill.
3116 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3117 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3118 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3119 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3120 MRI.clearVirtRegs();
3121 }
3122
// With a spill, jump to RestoreBB first so the clobbered pair is restored
// before control reaches DestBB.
3123 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3124 // Now, the distance could be defined.
3126 MCSymbolRefExpr::create(DestLabel, MCCtx),
3127 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3128 // Add offset assignments.
// Low half: offset & 0xFFFFFFFF. High half: offset >> 32 (arithmetic shift).
3129 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3130 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3131 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3132 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3133}
3134
3135unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3136 switch (Cond) {
3137 case SIInstrInfo::SCC_TRUE:
3138 return AMDGPU::S_CBRANCH_SCC1;
3139 case SIInstrInfo::SCC_FALSE:
3140 return AMDGPU::S_CBRANCH_SCC0;
3141 case SIInstrInfo::VCCNZ:
3142 return AMDGPU::S_CBRANCH_VCCNZ;
3143 case SIInstrInfo::VCCZ:
3144 return AMDGPU::S_CBRANCH_VCCZ;
3145 case SIInstrInfo::EXECNZ:
3146 return AMDGPU::S_CBRANCH_EXECNZ;
3147 case SIInstrInfo::EXECZ:
3148 return AMDGPU::S_CBRANCH_EXECZ;
3149 default:
3150 llvm_unreachable("invalid branch predicate");
3151 }
3152}
3153
3154SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3155 switch (Opcode) {
3156 case AMDGPU::S_CBRANCH_SCC0:
3157 return SCC_FALSE;
3158 case AMDGPU::S_CBRANCH_SCC1:
3159 return SCC_TRUE;
3160 case AMDGPU::S_CBRANCH_VCCNZ:
3161 return VCCNZ;
3162 case AMDGPU::S_CBRANCH_VCCZ:
3163 return VCCZ;
3164 case AMDGPU::S_CBRANCH_EXECNZ:
3165 return EXECNZ;
3166 case AMDGPU::S_CBRANCH_EXECZ:
3167 return EXECZ;
3168 default:
3169 return INVALID_BR;
3170 }
3171}
3172
// Analyzes the branch sequence that starts at iterator I in MBB.
// Returns false after filling in the outputs for one of three shapes:
//  * S_BRANCH                       -> TBB set, Cond untouched
//  * conditional branch, then end   -> TBB set, Cond = {predicate, operand 1}
//  * conditional branch + S_BRANCH  -> TBB and FBB set, Cond as above
// Returns true for anything else. Note that Cond may already have been
// appended to when true is returned after a conditional branch.
// NOTE(review): the opening lines of this signature are absent from this
// listing.
3176 MachineBasicBlock *&FBB,
3178 bool AllowModify) const {
3179 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3180 // Unconditional Branch
3181 TBB = I->getOperand(0).getMBB();
3182 return false;
3183 }
3184
3185 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3186 if (Pred == INVALID_BR)
3187 return true;
3188
3189 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3190 Cond.push_back(MachineOperand::CreateImm(Pred));
3191 Cond.push_back(I->getOperand(1)); // Save the branch register.
3192
3193 ++I;
3194
3195 if (I == MBB.end()) {
3196 // Conditional branch followed by fall-through.
3197 TBB = CondBB;
3198 return false;
3199 }
3200
3201 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3202 TBB = CondBB;
3203 FBB = I->getOperand(0).getMBB();
3204 return false;
3205 }
3206
3207 return true;
3208}
3209
// Entry point for branch analysis of MBB. Skips the leading *_term pseudo
// terminators used for exec management, then hands the first branch or
// return to analyzeBranchImpl. Returns false when the block has no
// terminators or only skippable ones; returns true immediately when an
// SI_IF, SI_ELSE or SI_KILL_* terminator is found.
// NOTE(review): the opening lines of this signature are absent from this
// listing.
3211 MachineBasicBlock *&FBB,
3213 bool AllowModify) const {
3214 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3215 auto E = MBB.end();
3216 if (I == E)
3217 return false;
3218
3219 // Skip over the instructions that are artificially terminators for special
3220 // exec management.
3221 while (I != E && !I->isBranch() && !I->isReturn()) {
3222 switch (I->getOpcode()) {
3223 case AMDGPU::S_MOV_B64_term:
3224 case AMDGPU::S_XOR_B64_term:
3225 case AMDGPU::S_OR_B64_term:
3226 case AMDGPU::S_ANDN2_B64_term:
3227 case AMDGPU::S_AND_B64_term:
3228 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3229 case AMDGPU::S_MOV_B32_term:
3230 case AMDGPU::S_XOR_B32_term:
3231 case AMDGPU::S_OR_B32_term:
3232 case AMDGPU::S_ANDN2_B32_term:
3233 case AMDGPU::S_AND_B32_term:
3234 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3235 break;
3236 case AMDGPU::SI_IF:
3237 case AMDGPU::SI_ELSE:
3238 case AMDGPU::SI_KILL_I1_TERMINATOR:
3239 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3240 // FIXME: It's messy that these need to be considered here at all.
3241 return true;
3242 default:
3243 llvm_unreachable("unexpected non-branch terminator inst");
3244 }
3245
3246 ++I;
3247 }
3248
3249 if (I == E)
3250 return false;
3251
3252 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3253}
3254
// Erases every terminator of MBB that is a branch or a return, leaving other
// (artificial) terminators in place. Returns the number of instructions
// erased; if BytesRemoved is non-null it receives their total encoded size
// as reported by getInstSizeInBytes.
// NOTE(review): the opening line of this signature is absent from this
// listing.
3256 int *BytesRemoved) const {
3257 unsigned Count = 0;
3258 unsigned RemovedSize = 0;
// make_early_inc_range keeps the iteration valid while MI is erased.
3259 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3260 // Skip over artificial terminators when removing instructions.
3261 if (MI.isBranch() || MI.isReturn()) {
3262 RemovedSize += getInstSizeInBytes(MI);
3263 MI.eraseFromParent();
3264 ++Count;
3265 }
3266 }
3267
3268 if (BytesRemoved)
3269 *BytesRemoved = RemovedSize;
3270
3271 return Count;
3272}
3273
3274// Copy the flags onto the implicit condition register operand.
// Only the undef and kill flags are transferred from OrigCond to CondReg;
// the register itself and all other flags of CondReg are left untouched.
// NOTE(review): the opening line of this signature is absent from this
// listing.
3276 const MachineOperand &OrigCond) {
3277 CondReg.setIsUndef(OrigCond.isUndef());
3278 CondReg.setIsKill(OrigCond.isKill());
3279}
3280
// Appends branch instructions to the end of MBB and returns how many were
// added:
//  * no FBB and empty Cond -> one S_BRANCH to TBB
//  * Cond but no FBB       -> one conditional branch to TBB
//  * Cond and FBB          -> a conditional branch to TBB, then S_BRANCH to FBB
// Cond[0] must be an immediate holding a BranchPredicate; Cond[1] supplies
// the undef/kill flags copied onto operand 1 of the conditional branch.
// If BytesAdded is non-null it is set to 4 bytes per emitted branch, or 8
// per branch when the subtarget reports hasOffset3fBug().
// NOTE(review): the opening lines of this signature are absent from this
// listing.
3283 MachineBasicBlock *FBB,
3285 const DebugLoc &DL,
3286 int *BytesAdded) const {
3287 if (!FBB && Cond.empty()) {
3288 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3289 .addMBB(TBB);
3290 if (BytesAdded)
3291 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3292 return 1;
3293 }
3294
3295 assert(TBB && Cond[0].isImm());
3296
3297 unsigned Opcode
3298 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3299
3300 if (!FBB) {
3301 MachineInstr *CondBr =
3302 BuildMI(&MBB, DL, get(Opcode))
3303 .addMBB(TBB);
3304
3305 // Copy the flags onto the implicit condition register operand.
3306 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3307 fixImplicitOperands(*CondBr);
3308
3309 if (BytesAdded)
3310 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3311 return 1;
3312 }
3313
3314 assert(TBB && FBB);
3315
3316 MachineInstr *CondBr =
3317 BuildMI(&MBB, DL, get(Opcode))
3318 .addMBB(TBB);
3319 fixImplicitOperands(*CondBr);
3320 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3321 .addMBB(FBB);
3322
// Same flag transfer as the one-way case above, written out by hand.
3323 MachineOperand &CondReg = CondBr->getOperand(1);
3324 CondReg.setIsUndef(Cond[1].isUndef());
3325 CondReg.setIsKill(Cond[1].isKill());
3326
3327 if (BytesAdded)
3328 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3329
3330 return 2;
3331}
3332
// Reverses a branch condition in place by negating the immediate in Cond[0].
// Returns false on success and true when Cond does not have exactly two
// entries or Cond[0] is not an immediate.
// NOTE(review): this presumably relies on BranchPredicate encoding each
// predicate and its inverse as negated values - confirm against the enum.
// The signature lines are absent from this listing.
3335 if (Cond.size() != 2) {
3336 return true;
3337 }
3338
3339 if (Cond[0].isImm()) {
3340 Cond[0].setImm(-Cond[0].getImm());
3341 return false;
3342 }
3343
3344 return true;
3345}
3346
// Decides whether a select can be emitted for the predicate in Cond[0] and
// reports its cost through CondCycles/TrueCycles/FalseCycles.
//  * VCCNZ/VCCZ: TrueReg and FalseReg must share a register class that has
//    VGPRs and is at most six dwords wide; the cost is one per dword.
//  * SCC_TRUE/SCC_FALSE: both registers must share an SGPR class; the cost is
//    one per dword, halved when the dword count is even.
//  * Any other predicate: not supported.
// The cycle outputs are written even on some paths that return false.
// NOTE(review): the opening lines of this signature are absent from this
// listing.
3349 Register DstReg, Register TrueReg,
3350 Register FalseReg, int &CondCycles,
3351 int &TrueCycles, int &FalseCycles) const {
3352 switch (Cond[0].getImm()) {
3353 case VCCNZ:
3354 case VCCZ: {
3355 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3356 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3357 if (MRI.getRegClass(FalseReg) != RC)
3358 return false;
3359
3360 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3361 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3362
3363 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3364 return RI.hasVGPRs(RC) && NumInsts <= 6;
3365 }
3366 case SCC_TRUE:
3367 case SCC_FALSE: {
3368 // FIXME: We could insert for VGPRs if we could replace the original compare
3369 // with a vector one.
3370 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3371 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3372 if (MRI.getRegClass(FalseReg) != RC)
3373 return false;
3374
3375 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3376
3377 // An even dword count (a multiple of 8 bytes) can use s_cselect_b64.
3378 if (NumInsts % 2 == 0)
3379 NumInsts /= 2;
3380
3381 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3382 return RI.isSGPRClass(RC);
3383 }
3384 default:
3385 return false;
3386 }
3387}
3388
// Emits, before I in MBB, a select of TrueReg/FalseReg into DstReg under the
// predicate in Cond[0].
//  * 32-bit destination: one S_CSELECT_B32 (SCC_TRUE) or V_CNDMASK_B32_e32.
//  * 64-bit destination with SCC_TRUE: one S_CSELECT_B64.
//  * Otherwise: one select per 32-bit (or, for SCC with an even dword count,
//    64-bit) piece, with the pieces combined by a REG_SEQUENCE into DstReg.
// In each case the undef/kill flags of Cond[1] are copied onto operand 3 of
// the select.
// NOTE(review): the opening lines of this signature and the declarations of
// Select, MIB and Regs are absent from this listing.
3392 Register TrueReg, Register FalseReg) const {
3393 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
// Canonicalize the "zero/false" predicates to their negated form by swapping
// the two inputs, so only SCC_TRUE and the VCC form are handled below.
3394 if (Pred == VCCZ || Pred == SCC_FALSE) {
3395 Pred = static_cast<BranchPredicate>(-Pred);
3396 std::swap(TrueReg, FalseReg);
3397 }
3398
3399 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3400 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3401 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3402
3403 if (DstSize == 32) {
3405 if (Pred == SCC_TRUE) {
3406 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3407 .addReg(TrueReg)
3408 .addReg(FalseReg);
3409 } else {
3410 // Instruction's operands are backwards from what is expected.
3411 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3412 .addReg(FalseReg)
3413 .addReg(TrueReg);
3414 }
3415
3416 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3417 return;
3418 }
3419
3420 if (DstSize == 64 && Pred == SCC_TRUE) {
3422 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3423 .addReg(TrueReg)
3424 .addReg(FalseReg);
3425
3426 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3427 return;
3428 }
3429
// Subregister indices for splitting a wide register into 32-bit pieces.
3430 static const int16_t Sub0_15[] = {
3431 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3432 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3433 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3434 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3435 };
3436
// Subregister indices for splitting a wide register into 64-bit pieces.
3437 static const int16_t Sub0_15_64[] = {
3438 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3439 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3440 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3441 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3442 };
3443
// Default to per-dword VALU selects; overridden below for SCC_TRUE.
3444 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3445 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3446 const int16_t *SubIndices = Sub0_15;
3447 int NElts = DstSize / 32;
3448
3449 // 64-bit select is only available for SALU.
3450 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3451 if (Pred == SCC_TRUE) {
3452 if (NElts % 2) {
3453 SelOp = AMDGPU::S_CSELECT_B32;
3454 EltRC = &AMDGPU::SGPR_32RegClass;
3455 } else {
3456 SelOp = AMDGPU::S_CSELECT_B64;
3457 EltRC = &AMDGPU::SGPR_64RegClass;
3458 SubIndices = Sub0_15_64;
3459 NElts /= 2;
3460 }
3461 }
3462
3464 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3465
// Move the insertion point to the REG_SEQUENCE so the per-piece selects
// built below are placed ahead of it.
3466 I = MIB->getIterator();
3467
3469 for (int Idx = 0; Idx != NElts; ++Idx) {
3470 Register DstElt = MRI.createVirtualRegister(EltRC);
3471 Regs.push_back(DstElt);
3472
3473 unsigned SubIdx = SubIndices[Idx];
3474
// V_CNDMASK takes its inputs in (false, true) order; S_CSELECT takes
// (true, false).
3476 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3477 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3478 .addReg(FalseReg, {}, SubIdx)
3479 .addReg(TrueReg, {}, SubIdx);
3480 } else {
3481 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3482 .addReg(TrueReg, {}, SubIdx)
3483 .addReg(FalseReg, {}, SubIdx);
3484 }
3485
3486 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3488
3489 MIB.addReg(DstElt)
3490 .addImm(SubIdx);
3491 }
3492}
3493
// Returns true when MI's opcode is one of the move/copy forms listed below
// (scalar, vector, AGPR and AV moves, plus COPY and WWM_COPY), false for any
// other opcode.
// NOTE(review): the signature line is absent from this listing.
3495 switch (MI.getOpcode()) {
3496 case AMDGPU::V_MOV_B16_t16_e32:
3497 case AMDGPU::V_MOV_B16_t16_e64:
3498 case AMDGPU::V_MOV_B32_e32:
3499 case AMDGPU::V_MOV_B32_e64:
3500 case AMDGPU::V_MOV_B64_PSEUDO:
3501 case AMDGPU::V_MOV_B64_e32:
3502 case AMDGPU::V_MOV_B64_e64:
3503 case AMDGPU::S_MOV_B32:
3504 case AMDGPU::S_MOV_B64:
3505 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3506 case AMDGPU::COPY:
3507 case AMDGPU::WWM_COPY:
3508 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3509 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3510 case AMDGPU::V_ACCVGPR_MOV_B32:
3511 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3512 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3513 return true;
3514 default:
3515 return false;
3516 }
3517}
3518
// Returns an operand index for the move/copy opcodes listed below: 2 for the
// two V_MOV_B16_t16 forms and 1 for every other listed opcode. Any opcode
// outside the list is a programming error (llvm_unreachable).
// NOTE(review): presumably this is the index of the copied source operand,
// with the t16 forms carrying an extra operand ahead of it - confirm. The
// signature line is absent from this listing.
3520 switch (MI.getOpcode()) {
3521 case AMDGPU::V_MOV_B16_t16_e32:
3522 case AMDGPU::V_MOV_B16_t16_e64:
3523 return 2;
3524 case AMDGPU::V_MOV_B32_e32:
3525 case AMDGPU::V_MOV_B32_e64:
3526 case AMDGPU::V_MOV_B64_PSEUDO:
3527 case AMDGPU::V_MOV_B64_e32:
3528 case AMDGPU::V_MOV_B64_e64:
3529 case AMDGPU::S_MOV_B32:
3530 case AMDGPU::S_MOV_B64:
3531 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3532 case AMDGPU::COPY:
3533 case AMDGPU::WWM_COPY:
3534 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3535 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3536 case AMDGPU::V_ACCVGPR_MOV_B32:
3537 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3538 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3539 return 1;
3540 default:
3541 llvm_unreachable("MI is not a foldable copy");
3542 }
3543}
3544
3545static constexpr AMDGPU::OpName ModifierOpNames[] = {
3546 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3547 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3548 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3549
// Removes from MI every operand named in ModifierOpNames that MI's opcode
// has. The opcode is read once up front, so each index comes from the
// original operand layout.
// NOTE(review): the table is walked in reverse, presumably so that removing
// an operand never shifts the index of one still to be removed - confirm the
// table is in ascending operand order. The signature line is absent from
// this listing.
3551 unsigned Opc = MI.getOpcode();
3552 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3553 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3554 if (Idx >= 0)
3555 MI.removeOperand(Idx);
3556 }
3557}
3558
3560 const MCInstrDesc &NewDesc) const {
3561 MI.setDesc(NewDesc);
3562
3563 // Remove any leftover implicit operands from mutating the instruction. e.g.
3564 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3565 // anymore.
3566 const MCInstrDesc &Desc = MI.getDesc();
3567 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3568 Desc.implicit_defs().size();
3569
3570 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3571 MI.removeOperand(I);
3572}
3573
3574std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3575 unsigned SubRegIndex) {
3576 switch (SubRegIndex) {
3577 case AMDGPU::NoSubRegister:
3578 return Imm;
3579 case AMDGPU::sub0:
3580 return SignExtend64<32>(Imm);
3581 case AMDGPU::sub1:
3582 return SignExtend64<32>(Imm >> 32);
3583 case AMDGPU::lo16:
3584 return SignExtend64<16>(Imm);
3585 case AMDGPU::hi16:
3586 return SignExtend64<16>(Imm >> 16);
3587 case AMDGPU::sub1_lo16:
3588 return SignExtend64<16>(Imm >> 32);
3589 case AMDGPU::sub1_hi16:
3590 return SignExtend64<16>(Imm >> 48);
3591 default:
3592 return std::nullopt;
3593 }
3594
3595 llvm_unreachable("covered subregister switch");
3596}
3597
3598static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3599 switch (Opc) {
3600 case AMDGPU::V_MAC_F16_e32:
3601 case AMDGPU::V_MAC_F16_e64:
3602 case AMDGPU::V_MAD_F16_e64:
3603 return AMDGPU::V_MADAK_F16;
3604 case AMDGPU::V_MAC_F32_e32:
3605 case AMDGPU::V_MAC_F32_e64:
3606 case AMDGPU::V_MAD_F32_e64:
3607 return AMDGPU::V_MADAK_F32;
3608 case AMDGPU::V_FMAC_F32_e32:
3609 case AMDGPU::V_FMAC_F32_e64:
3610 case AMDGPU::V_FMA_F32_e64:
3611 return AMDGPU::V_FMAAK_F32;
3612 case AMDGPU::V_FMAC_F16_e32:
3613 case AMDGPU::V_FMAC_F16_e64:
3614 case AMDGPU::V_FMAC_F16_t16_e64:
3615 case AMDGPU::V_FMAC_F16_fake16_e64:
3616 case AMDGPU::V_FMAC_F16_t16_e32:
3617 case AMDGPU::V_FMAC_F16_fake16_e32:
3618 case AMDGPU::V_FMA_F16_e64:
3619 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3620 ? AMDGPU::V_FMAAK_F16_t16
3621 : AMDGPU::V_FMAAK_F16_fake16
3622 : AMDGPU::V_FMAAK_F16;
3623 case AMDGPU::V_FMAC_F64_e32:
3624 case AMDGPU::V_FMAC_F64_e64:
3625 case AMDGPU::V_FMA_F64_e64:
3626 return AMDGPU::V_FMAAK_F64;
3627 default:
3628 llvm_unreachable("invalid instruction");
3629 }
3630}
3631
3632static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3633 switch (Opc) {
3634 case AMDGPU::V_MAC_F16_e32:
3635 case AMDGPU::V_MAC_F16_e64:
3636 case AMDGPU::V_MAD_F16_e64:
3637 return AMDGPU::V_MADMK_F16;
3638 case AMDGPU::V_MAC_F32_e32:
3639 case AMDGPU::V_MAC_F32_e64:
3640 case AMDGPU::V_MAD_F32_e64:
3641 return AMDGPU::V_MADMK_F32;
3642 case AMDGPU::V_FMAC_F32_e32:
3643 case AMDGPU::V_FMAC_F32_e64:
3644 case AMDGPU::V_FMA_F32_e64:
3645 return AMDGPU::V_FMAMK_F32;
3646 case AMDGPU::V_FMAC_F16_e32:
3647 case AMDGPU::V_FMAC_F16_e64:
3648 case AMDGPU::V_FMAC_F16_t16_e64:
3649 case AMDGPU::V_FMAC_F16_fake16_e64:
3650 case AMDGPU::V_FMAC_F16_t16_e32:
3651 case AMDGPU::V_FMAC_F16_fake16_e32:
3652 case AMDGPU::V_FMA_F16_e64:
3653 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3654 ? AMDGPU::V_FMAMK_F16_t16
3655 : AMDGPU::V_FMAMK_F16_fake16
3656 : AMDGPU::V_FMAMK_F16;
3657 case AMDGPU::V_FMAC_F64_e32:
3658 case AMDGPU::V_FMAC_F64_e64:
3659 case AMDGPU::V_FMA_F64_e64:
3660 return AMDGPU::V_FMAMK_F64;
3661 default:
3662 llvm_unreachable("invalid instruction");
3663 }
3664}
3665
3667 Register Reg, MachineRegisterInfo *MRI) const {
3668 int64_t Imm;
3669 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3670 return false;
3671
3672 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3673
3674 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3675
3676 unsigned Opc = UseMI.getOpcode();
3677 if (Opc == AMDGPU::COPY) {
3678 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3679
3680 Register DstReg = UseMI.getOperand(0).getReg();
3681 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3682
3683 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3684
3685 if (HasMultipleUses) {
3686 // TODO: This should fold in more cases with multiple use, but we need to
3687 // more carefully consider what those uses are.
3688 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3689
3690 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3691 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3692 return false;
3693
3694 // Most of the time folding a 32-bit inline constant is free (though this
3695 // might not be true if we can't later fold it into a real user).
3696 //
3697 // FIXME: This isInlineConstant check is imprecise if
3698 // getConstValDefinedInReg handled the tricky non-mov cases.
3699 if (ImmDefSize == 32 &&
3701 return false;
3702 }
3703
3704 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3705 RI.getSubRegIdxSize(UseSubReg) == 16;
3706
3707 if (Is16Bit) {
3708 if (RI.hasVGPRs(DstRC))
3709 return false; // Do not clobber vgpr_hi16
3710
3711 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3712 return false;
3713 }
3714
3715 MachineFunction *MF = UseMI.getMF();
3716
3717 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3718 MCRegister MovDstPhysReg =
3719 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3720
3721 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3722
3723 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3724 for (unsigned MovOp :
3725 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3726 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3727 const MCInstrDesc &MovDesc = get(MovOp);
3728
3729 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3730 if (Is16Bit) {
3731 // We just need to find a correctly sized register class, so the
3732 // subregister index compatibility doesn't matter since we're statically
3733 // extracting the immediate value.
3734 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3735 if (!MovDstRC)
3736 continue;
3737
3738 if (MovDstPhysReg) {
3739 // FIXME: We probably should not do this. If there is a live value in
3740 // the high half of the register, it will be corrupted.
3741 MovDstPhysReg =
3742 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3743 if (!MovDstPhysReg)
3744 continue;
3745 }
3746 }
3747
3748 // Result class isn't the right size, try the next instruction.
3749 if (MovDstPhysReg) {
3750 if (!MovDstRC->contains(MovDstPhysReg))
3751 return false;
3752 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3753 // TODO: This will be overly conservative in the case of 16-bit virtual
3754 // SGPRs. We could hack up the virtual register uses to use a compatible
3755 // 32-bit class.
3756 continue;
3757 }
3758
3759 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3760
3761 // Ensure the interpreted immediate value is a valid operand in the new
3762 // mov.
3763 //
3764 // FIXME: isImmOperandLegal should have form that doesn't require existing
3765 // MachineInstr or MachineOperand
3766 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3767 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3768 break;
3769
3770 NewOpc = MovOp;
3771 break;
3772 }
3773
3774 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3775 return false;
3776
3777 if (Is16Bit) {
3778 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3779 if (MovDstPhysReg)
3780 UseMI.getOperand(0).setReg(MovDstPhysReg);
3781 assert(UseMI.getOperand(1).getReg().isVirtual());
3782 }
3783
3784 const MCInstrDesc &NewMCID = get(NewOpc);
3785 UseMI.setDesc(NewMCID);
3786 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3787 UseMI.addImplicitDefUseOperands(*MF);
3788 return true;
3789 }
3790
3791 if (HasMultipleUses)
3792 return false;
3793
3794 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3795 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3796 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3797 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3798 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3799 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3800 Opc == AMDGPU::V_FMAC_F64_e64) {
3801 // Don't fold if we are using source or output modifiers. The new VOP2
3802 // instructions don't have them.
3804 return false;
3805
3806 // If this is a free constant, there's no reason to do this.
3807 // TODO: We could fold this here instead of letting SIFoldOperands do it
3808 // later.
3809 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3810
3811 // Any src operand can be used for the legality check.
3812 if (isInlineConstant(UseMI, Src0Idx, Imm))
3813 return false;
3814
3815 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3816
3817 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3818 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3819
3820 auto CopyRegOperandToNarrowerRC =
3821 [MRI, this](MachineInstr &MI, unsigned OpNo,
3822 const TargetRegisterClass *NewRC) -> void {
3823 if (!MI.getOperand(OpNo).isReg())
3824 return;
3825 Register Reg = MI.getOperand(OpNo).getReg();
3826 const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg);
3827 if (RI.getCommonSubClass(RC, NewRC) != NewRC)
3828 return;
3829 Register Tmp = MRI->createVirtualRegister(NewRC);
3830 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
3831 get(AMDGPU::COPY), Tmp)
3832 .addReg(Reg);
3833 MI.getOperand(OpNo).setReg(Tmp);
3834 MI.getOperand(OpNo).setIsKill();
3835 };
3836
3837 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3838 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3839 (Src1->isReg() && Src1->getReg() == Reg)) {
3840 MachineOperand *RegSrc =
3841 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3842 if (!RegSrc->isReg())
3843 return false;
3844 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3845 ST.getConstantBusLimit(Opc) < 2)
3846 return false;
3847
3848 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3849 return false;
3850
3851 // If src2 is also a literal constant then we have to choose which one to
3852 // fold. In general it is better to choose madak so that the other literal
3853 // can be materialized in an sgpr instead of a vgpr:
3854 // s_mov_b32 s0, literal
3855 // v_madak_f32 v0, s0, v0, literal
3856 // Instead of:
3857 // v_mov_b32 v1, literal
3858 // v_madmk_f32 v0, v0, literal, v1
3859 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3860 if (Def && Def->isMoveImmediate() &&
3861 !isInlineConstant(Def->getOperand(1)))
3862 return false;
3863
3864 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3865 if (pseudoToMCOpcode(NewOpc) == -1)
3866 return false;
3867
3868 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3869 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3870
3871 // FIXME: This would be a lot easier if we could return a new instruction
3872 // instead of having to modify in place.
3873
3874 Register SrcReg = RegSrc->getReg();
3875 unsigned SrcSubReg = RegSrc->getSubReg();
3876 Src0->setReg(SrcReg);
3877 Src0->setSubReg(SrcSubReg);
3878 Src0->setIsKill(RegSrc->isKill());
3879
3880 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3881 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3882 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3883 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3884 UseMI.untieRegOperand(
3885 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3886
3887 Src1->ChangeToImmediate(*SubRegImm);
3888
3890 UseMI.setDesc(get(NewOpc));
3891
3892 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3893 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3894 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3895 Register Tmp = MRI->createVirtualRegister(NewRC);
3896 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3897 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3898 UseMI.getOperand(0).getReg())
3899 .addReg(Tmp, RegState::Kill);
3900 UseMI.getOperand(0).setReg(Tmp);
3901 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3902 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3903 }
3904
3905 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3906 if (DeleteDef)
3907 DefMI.eraseFromParent();
3908
3909 return true;
3910 }
3911
3912 // Added part is the constant: Use v_madak_{f16, f32}.
3913 if (Src2->isReg() && Src2->getReg() == Reg) {
3914 if (ST.getConstantBusLimit(Opc) < 2) {
3915 // Not allowed to use constant bus for another operand.
3916 // We can however allow an inline immediate as src0.
3917 bool Src0Inlined = false;
3918 if (Src0->isReg()) {
3919 // Try to inline constant if possible.
3920 // If the Def moves immediate and the use is single
3921 // We are saving VGPR here.
3922 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3923 if (Def && Def->isMoveImmediate() &&
3924 isInlineConstant(Def->getOperand(1)) &&
3925 MRI->hasOneNonDBGUse(Src0->getReg())) {
3926 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3927 Src0Inlined = true;
3928 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3929 RI.isSGPRReg(*MRI, Src0->getReg())) {
3930 return false;
3931 }
3932 // VGPR is okay as Src0 - fallthrough
3933 }
3934
3935 if (Src1->isReg() && !Src0Inlined) {
3936 // We have one slot for inlinable constant so far - try to fill it
3937 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3938 if (Def && Def->isMoveImmediate() &&
3939 isInlineConstant(Def->getOperand(1)) &&
3940 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3941 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3942 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3943 return false;
3944 // VGPR is okay as Src1 - fallthrough
3945 }
3946 }
3947
3948 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3949 if (pseudoToMCOpcode(NewOpc) == -1)
3950 return false;
3951
3952 // FIXME: This would be a lot easier if we could return a new instruction
3953 // instead of having to modify in place.
3954
3955 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3956 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3957 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3958 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3959 UseMI.untieRegOperand(
3960 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3961
3962 const std::optional<int64_t> SubRegImm =
3963 extractSubregFromImm(Imm, Src2->getSubReg());
3964
3965 // ChangingToImmediate adds Src2 back to the instruction.
3966 Src2->ChangeToImmediate(*SubRegImm);
3967
3968 // These come before src2.
3970 UseMI.setDesc(get(NewOpc));
3971
3972 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3973 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
3974 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3975 Register Tmp = MRI->createVirtualRegister(NewRC);
3976 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3977 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3978 UseMI.getOperand(0).getReg())
3979 .addReg(Tmp, RegState::Kill);
3980 UseMI.getOperand(0).setReg(Tmp);
3981 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3982 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
3983 }
3984
3985 // It might happen that UseMI was commuted
3986 // and we now have SGPR as SRC1. If so 2 inlined
3987 // constant and SGPR are illegal.
3989
3990 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3991 if (DeleteDef)
3992 DefMI.eraseFromParent();
3993
3994 return true;
3995 }
3996 }
3997
3998 return false;
3999}
4000
// Returns true when both base-operand lists have the same length and every
// pair of operands at the same index is identical according to isIdenticalTo.
// NOTE(review): the lines carrying the function name and parameter list are
// missing from this listing (it jumps from `static bool` straight to the
// body). The call in checkInstOffsetsDoNotOverlap suggests this is
// memOpsHaveSameBaseOperands(BaseOps1, BaseOps2) -- confirm against the
// original source.
4001 static bool
4004 if (BaseOps1.size() != BaseOps2.size())
4005 return false;
// Compare element-wise; the first mismatch decides the result.
4006 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
4007 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
4008 return false;
4009 }
4010 return true;
4011 }
4012
4013static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
4014 LocationSize WidthB, int OffsetB) {
4015 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
4016 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
4017 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
4018 return LowWidth.hasValue() &&
4019 LowOffset + (int)LowWidth.getValue() <= HighOffset;
4020}
4021
4022bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
4023 const MachineInstr &MIb) const {
4024 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
4025 int64_t Offset0, Offset1;
4026 LocationSize Dummy0 = LocationSize::precise(0);
4027 LocationSize Dummy1 = LocationSize::precise(0);
4028 bool Offset0IsScalable, Offset1IsScalable;
4029 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
4030 Dummy0, &RI) ||
4031 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
4032 Dummy1, &RI))
4033 return false;
4034
4035 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
4036 return false;
4037
4038 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
4039 // FIXME: Handle ds_read2 / ds_write2.
4040 return false;
4041 }
4042 LocationSize Width0 = MIa.memoperands().front()->getSize();
4043 LocationSize Width1 = MIb.memoperands().front()->getSize();
4044 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
4045}
4046
// Conservatively decides whether the memory accesses of MIa and MIb can be
// shown not to overlap. Returns true only when the instruction kinds are
// treated as disjoint below (DS / MUBUF+MTBUF / SMRD / FLAT combinations) or
// when checkInstOffsetsDoNotOverlap proves it for two instructions of the same
// kind; every other situation returns false.
// NOTE(review): the first line of the signature (return type, function name
// and the MIa parameter) is missing from this listing -- confirm against the
// original source.
4048 const MachineInstr &MIb) const {
4049 assert(MIa.mayLoadOrStore() &&
4050 "MIa must load from or modify a memory location");
4051 assert(MIb.mayLoadOrStore() &&
4052 "MIb must load from or modify a memory location");
4053
// NOTE(review): the condition guarding the following return is missing from
// this listing (one line was dropped here).
4055 return false;
4056
4057 // XXX - Can we relax this between address spaces?
4058 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
4059 return false;
4060
// LDS-DMA instructions are never reported as disjoint.
4061 if (isLDSDMA(MIa) || isLDSDMA(MIb))
4062 return false;
4063
// Bundles are not analyzed; give the conservative answer.
4064 if (MIa.isBundle() || MIb.isBundle())
4065 return false;
4066
4067 // TODO: Should we check the address space from the MachineMemOperand? That
4068 // would allow us to distinguish objects we know don't alias based on the
4069 // underlying address space, even if it was lowered to a different one,
4070 // e.g. private accesses lowered to use MUBUF instructions on a scratch
4071 // buffer.
4072 if (isDS(MIa)) {
4073 if (isDS(MIb))
4074 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4075
// DS vs. non-DS: disjoint unless MIb is a FLAT instruction that is not
// segment-specific.
4076 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
4077 }
4078
4079 if (isMUBUF(MIa) || isMTBUF(MIa)) {
4080 if (isMUBUF(MIb) || isMTBUF(MIb))
4081 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4082
// Buffer vs. FLAT: only reported disjoint when MIb is FLAT scratch.
4083 if (isFLAT(MIb))
4084 return isFLATScratch(MIb);
4085
// Buffer vs. anything else: disjoint unless MIb is SMRD.
4086 return !isSMRD(MIb);
4087 }
4088
4089 if (isSMRD(MIa)) {
4090 if (isSMRD(MIb))
4091 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4092
// SMRD vs. FLAT: only reported disjoint when MIb is FLAT scratch.
4093 if (isFLAT(MIb))
4094 return isFLATScratch(MIb);
4095
// SMRD vs. anything else: disjoint unless MIb is a buffer instruction.
4096 return !isMUBUF(MIb) && !isMTBUF(MIb);
4097 }
4098
4099 if (isFLAT(MIa)) {
4100 if (isFLAT(MIb)) {
// A FLAT scratch access paired with a FLAT global access is always
// reported disjoint, in either order.
4101 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
4102 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
4103 return true;
4104
4105 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4106 }
4107
4108 return false;
4109 }
4110
// MIa is none of the kinds handled above: be conservative.
4111 return false;
4112 }
4113
// Looks through a virtual register to an immediate: when Reg has a unique
// definition that SIInstrInfo::isFoldableCopy accepts and whose operand 1 is an
// immediate, writes that immediate to Imm, reports the defining instruction
// through DefMI if a pointer was supplied, and returns true. Physical registers
// and every other case return false and leave Imm and DefMI untouched.
// NOTE(review): the first line of the signature is missing from this listing;
// the call in the MachineOperand overload passes (Reg, MRI, Imm, DefMI) --
// confirm the exact declaration against the original source.
4115 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4116 if (Reg.isPhysical())
4117 return false;
4118 auto *Def = MRI.getUniqueVRegDef(Reg);
4119 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
4120 Imm = Def->getOperand(1).getImm();
// DefMI is an optional out-parameter.
4121 if (DefMI)
4122 *DefMI = Def;
4123 return true;
4124 }
4125 return false;
4126 }
4127
4128static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4129 MachineInstr **DefMI = nullptr) {
4130 if (!MO->isReg())
4131 return false;
4132 const MachineFunction *MF = MO->getParent()->getMF();
4133 const MachineRegisterInfo &MRI = MF->getRegInfo();
4134 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4135}
4136
// Moves kill bookkeeping from MI to NewMI: for every register operand of MI
// that is marked as a kill, asks LiveVariables to record NewMI as the killing
// instruction instead. Does nothing when LV is null.
// NOTE(review): the first line of the signature is missing from this listing;
// the call site later in this file passes (LV, MI, *NewMI) -- confirm the
// exact declaration against the original source.
4138 MachineInstr &NewMI) {
4139 if (LV) {
4140 unsigned NumOps = MI.getNumOperands();
// The scan starts at operand 1; operand 0 is not inspected.
4141 for (unsigned I = 1; I < NumOps; ++I) {
4142 MachineOperand &Op = MI.getOperand(I);
4143 if (Op.isReg() && Op.isKill())
4144 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4145 }
4146 }
4147 }
4148
4149static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4150 switch (Opc) {
4151 case AMDGPU::V_MAC_F16_e32:
4152 case AMDGPU::V_MAC_F16_e64:
4153 return AMDGPU::V_MAD_F16_e64;
4154 case AMDGPU::V_MAC_F32_e32:
4155 case AMDGPU::V_MAC_F32_e64:
4156 return AMDGPU::V_MAD_F32_e64;
4157 case AMDGPU::V_MAC_LEGACY_F32_e32:
4158 case AMDGPU::V_MAC_LEGACY_F32_e64:
4159 return AMDGPU::V_MAD_LEGACY_F32_e64;
4160 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4161 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4162 return AMDGPU::V_FMA_LEGACY_F32_e64;
4163 case AMDGPU::V_FMAC_F16_e32:
4164 case AMDGPU::V_FMAC_F16_e64:
4165 case AMDGPU::V_FMAC_F16_t16_e64:
4166 case AMDGPU::V_FMAC_F16_fake16_e64:
4167 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4168 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4169 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4170 : AMDGPU::V_FMA_F16_gfx9_e64;
4171 case AMDGPU::V_FMAC_F32_e32:
4172 case AMDGPU::V_FMAC_F32_e64:
4173 return AMDGPU::V_FMA_F32_e64;
4174 case AMDGPU::V_FMAC_F64_e32:
4175 case AMDGPU::V_FMAC_F64_e64:
4176 return AMDGPU::V_FMA_F64_e64;
4177 default:
4178 llvm_unreachable("invalid instruction");
4179 }
4180}
4181
// NOTE(review): the struct header line and the member declaration are missing
// from this listing. Later code in this file uses the type name
// ThreeAddressUpdates and a member RemoveMIUse (assigned a MachineInstr
// pointer) -- confirm the exact declarations against the original source.
4182 /// Helper struct for the implementation of 3-address conversion to communicate
4183 /// updates made to instruction operands.
4185 /// Other instruction whose def is no longer used by the converted
4186 /// instruction.
4188 };
4189
// Converts MI -- or the single instruction inside a one-instruction bundle --
// to its three-address form through convertToThreeAddressImpl, then repairs
// LiveVariables / LiveIntervals and cleans up the instruction recorded in
// U.RemoveMIUse whose result the converted instruction no longer reads.
// Returns nullptr when nothing was converted, MI itself when MI is a bundle,
// and the newly built instruction otherwise.
// NOTE(review): the first line of the signature (return type, function name
// and the MI parameter) is missing from this listing -- confirm against the
// original source.
4191 LiveVariables *LV,
4192 LiveIntervals *LIS) const {
4193 MachineBasicBlock &MBB = *MI.getParent();
4194 MachineInstr *CandidateMI = &MI;
4195
4196 if (MI.isBundle()) {
4197 // This is a temporary placeholder for bundle handling that enables us to
4198 // exercise the relevant code paths in the two-address instruction pass.
4199 if (MI.getBundleSize() != 1)
4200 return nullptr;
// The instruction to convert is the one following the bundle header.
4201 CandidateMI = MI.getNextNode();
4202 }
4203
// NOTE(review): the declaration of U (presumably a ThreeAddressUpdates object)
// is missing from this listing at this point.
4205 MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
4206 if (!NewMI)
4207 return nullptr;
4208
4209 if (MI.isBundle()) {
// Drop the old instruction from the bundle and untie every tied def on the
// bundle header.
4210 CandidateMI->eraseFromBundle();
4211
4212 for (MachineOperand &MO : MI.all_defs()) {
4213 if (MO.isTied())
4214 MI.untieRegOperand(MO.getOperandNo());
4215 }
4216 } else {
4217 updateLiveVariables(LV, MI, *NewMI);
4218 if (LIS) {
4219 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4220 // SlotIndex of defs needs to be updated when converting to early-clobber
4221 MachineOperand &Def = NewMI->getOperand(0);
4222 if (Def.isEarlyClobber() && Def.isReg() &&
4223 LIS->hasInterval(Def.getReg())) {
4224 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4225 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4226 auto &LI = LIS->getInterval(Def.getReg());
// Rewrites the segment (and its value number) that starts at the normal
// register slot so that it starts at the early-clobber slot instead.
4227 auto UpdateDefIndex = [&](LiveRange &LR) {
4228 auto *S = LR.find(OldIndex);
4229 if (S != LR.end() && S->start == OldIndex) {
4230 assert(S->valno && S->valno->def == OldIndex);
4231 S->start = NewIndex;
4232 S->valno->def = NewIndex;
4233 }
4234 };
// Apply to the main range and to every subrange.
4235 UpdateDefIndex(LI);
4236 for (auto &SR : LI.subranges())
4237 UpdateDefIndex(SR);
4238 }
4239 }
4240 }
4241
// Clean up after an immediate that was folded into the new instruction.
4242 if (U.RemoveMIUse) {
4243 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4244 // The only user is the instruction which will be killed.
4245 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4246
4247 if (MRI.hasOneNonDBGUse(DefReg)) {
4248 // We cannot just remove the DefMI here, calling pass will crash.
4249 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4250 U.RemoveMIUse->getOperand(0).setIsDead(true);
// Strip every operand except operand 0, walking from the back.
4251 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4252 U.RemoveMIUse->removeOperand(I);
4253 if (LV)
4254 LV->getVarInfo(DefReg).AliveBlocks.clear();
4255 }
4256
4257 if (MI.isBundle()) {
// If nothing inside the bundle reads or writes DefReg any more, remove the
// first matching use operand from the bundle header.
4258 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4259 if (!VRI.Reads && !VRI.Writes) {
4260 for (MachineOperand &MO : MI.all_uses()) {
4261 if (MO.isReg() && MO.getReg() == DefReg) {
4262 assert(MO.getSubReg() == 0 &&
4263 "tied sub-registers in bundles currently not supported");
4264 MI.removeOperand(MO.getOperandNo());
4265 break;
4266 }
4267 }
4268
4269 if (LIS)
4270 LIS->shrinkToUses(&LIS->getInterval(DefReg));
4271 }
4272 } else if (LIS) {
4273 LiveInterval &DefLI = LIS->getInterval(DefReg);
4274
4275 // We cannot delete the original instruction here, so hack out the use
4276 // in the original instruction with a dummy register so we can use
4277 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4278 // not have the complexity of deleting a use to consider here.
4279 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4280 for (MachineOperand &MIOp : MI.uses()) {
4281 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4282 MIOp.setIsUndef(true);
4283 MIOp.setReg(DummyReg);
4284 }
4285 }
4286
// NOTE(review): this branch is only entered when the MI.isBundle() test just
// above was false, so the following check looks unreachable -- confirm
// whether it can be removed.
4287 if (MI.isBundle()) {
4288 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4289 if (!VRI.Reads && !VRI.Writes) {
4290 for (MachineOperand &MIOp : MI.uses()) {
4291 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4292 MIOp.setIsUndef(true);
4293 MIOp.setReg(DummyReg);
4294 }
4295 }
4296 }
4297
4298 MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
4299 false, /*isUndef=*/true));
4300 }
4301
4302 LIS->shrinkToUses(&DefLI);
4303 }
4304 }
4305
// For bundles the caller gets the bundle header back, not the inner
// instruction.
4306 return MI.isBundle() ? &MI : NewMI;
4307 }
4308
// Builds the three-address replacement for MI (with BuildMI at MI's position)
// and returns it, or returns nullptr when MI cannot be converted. Three
// families are handled: MFMA opcodes with an early-clobber variant, WMMA
// instructions, and MAC/FMAC opcodes. For MAC/FMAC the FMAAK/FMAMK forms with
// a folded immediate are tried first (recording the immediate's defining
// instruction in U.RemoveMIUse); otherwise the VOP3 MAD/FMA opcode chosen by
// getNewFMAInst is used.
// NOTE(review): the return-type line preceding this signature is missing from
// this listing -- confirm against the original source.
4310 SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4311 ThreeAddressUpdates &U) const {
4312 MachineBasicBlock &MBB = *MI.getParent();
4313 unsigned Opc = MI.getOpcode();
4314
4315 // Handle MFMA.
4316 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4317 if (NewMFMAOpc != -1) {
// NOTE(review): the line that declares MIB for this BuildMI call is missing
// from this listing.
4319 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
// Copy all explicit operands unchanged onto the new opcode.
4320 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4321 MIB.add(MI.getOperand(I));
4322 return MIB;
4323 }
4324
// Handle WMMA: same operands and instruction flags, three-address opcode.
4325 if (SIInstrInfo::isWMMA(MI)) {
4326 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4327 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4328 .setMIFlags(MI.getFlags());
4329 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4330 MIB->addOperand(MI.getOperand(I));
4331 return MIB;
4332 }
4333
4334 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4335 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4336 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4337 "present pre-RA");
4338
4339 // Handle MAC/FMAC.
4340 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4341 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4342 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4343 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4344 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
// Set when src0 of an e32 form is an immediate that is not an inline constant.
4345 bool Src0Literal = false;
4346
4347 switch (Opc) {
4348 default:
// Not a MAC/FMAC opcode handled here.
4349 return nullptr;
4350 case AMDGPU::V_MAC_F16_e64:
4351 case AMDGPU::V_FMAC_F16_e64:
4352 case AMDGPU::V_FMAC_F16_t16_e64:
4353 case AMDGPU::V_FMAC_F16_fake16_e64:
4354 case AMDGPU::V_MAC_F32_e64:
4355 case AMDGPU::V_MAC_LEGACY_F32_e64:
4356 case AMDGPU::V_FMAC_F32_e64:
4357 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4358 case AMDGPU::V_FMAC_F64_e64:
4359 break;
4360 case AMDGPU::V_MAC_F16_e32:
4361 case AMDGPU::V_FMAC_F16_e32:
4362 case AMDGPU::V_MAC_F32_e32:
4363 case AMDGPU::V_MAC_LEGACY_F32_e32:
4364 case AMDGPU::V_FMAC_F32_e32:
4365 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4366 case AMDGPU::V_FMAC_F64_e32: {
// e32 forms: src0 must be a register or an immediate.
4367 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4368 AMDGPU::OpName::src0);
4369 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4370 if (!Src0->isReg() && !Src0->isImm())
4371 return nullptr;
4372
4373 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4374 Src0Literal = true;
4375
4376 break;
4377 }
4378 }
4379
// Look up the named operands; getNamedOperand results are null-checked below
// for the modifier, clamp, omod and op_sel operands.
4380 MachineInstrBuilder MIB;
4381 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4382 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4383 const MachineOperand *Src0Mods =
4384 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4385 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4386 const MachineOperand *Src1Mods =
4387 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4388 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4389 const MachineOperand *Src2Mods =
4390 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4391 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4392 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4393 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4394
// The literal-constant forms are only attempted when the instruction has no
// modifier, clamp or omod operands, is not a legacy opcode, and (for F64) the
// subtarget has the F64 FMAAK/FMAMK instructions.
4395 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4396 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4397 // If we have an SGPR input, we will violate the constant bus restriction.
4398 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4399 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4400 MachineInstr *DefMI;
4401
4402 int64_t Imm;
// Case 1: the addend (src2) is a foldable immediate -> FMAAK form.
4403 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4404 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4405 if (pseudoToMCOpcode(NewOpc) != -1) {
4406 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4407 .add(*Dst)
4408 .add(*Src0)
4409 .add(*Src1)
4410 .addImm(Imm)
4411 .setMIFlags(MI.getFlags());
4412 U.RemoveMIUse = DefMI;
4413 return MIB;
4414 }
4415 }
// Case 2: src1 is a foldable immediate -> FMAMK form with src0 kept first.
4416 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4417 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4418 if (pseudoToMCOpcode(NewOpc) != -1) {
4419 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4420 .add(*Dst)
4421 .add(*Src0)
4422 .addImm(Imm)
4423 .add(*Src2)
4424 .setMIFlags(MI.getFlags());
4425 U.RemoveMIUse = DefMI;
4426 return MIB;
4427 }
4428 }
// Case 3: src0 is a literal or a foldable immediate -> FMAMK form with src1
// moved into the first source slot.
4429 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4430 if (Src0Literal) {
// The literal lives in MI itself, so there is no defining instruction to
// clean up.
4431 Imm = Src0->getImm();
4432 DefMI = nullptr;
4433 }
// NOTE(review): the line naming the call that takes the arguments on the next
// two lines is missing from this listing.
4434 if (pseudoToMCOpcode(NewOpc) != -1 &&
4436 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4437 Src1)) {
4438 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4439 .add(*Dst)
4440 .add(*Src1)
4441 .addImm(Imm)
4442 .add(*Src2)
4443 .setMIFlags(MI.getFlags());
4444 U.RemoveMIUse = DefMI;
4445 return MIB;
4446 }
4447 }
4448 }
4449
4450 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4451 // if VOP3 does not allow a literal operand.
4452 if (Src0Literal && !ST.hasVOP3Literal())
4453 return nullptr;
4454
4455 unsigned NewOpc = getNewFMAInst(ST, Opc);
4456
4457 if (pseudoToMCOpcode(NewOpc) == -1)
4458 return nullptr;
4459
// Generic fallback: emit the VOP3 form; operands that MI does not have are
// filled in with 0.
4460 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4461 .add(*Dst)
4462 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4463 .add(*Src0)
4464 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4465 .add(*Src1)
4466 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4467 .add(*Src2)
4468 .addImm(Clamp ? Clamp->getImm() : 0)
4469 .addImm(Omod ? Omod->getImm() : 0)
4470 .setMIFlags(MI.getFlags());
// op_sel is appended only when the new opcode declares that operand.
4471 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4472 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4473 return MIB;
4474 }
4475
4476 // It's not generally safe to move VALU instructions across these since it will
4477 // start using the register as a base index rather than directly.
4478 // XXX - Why isn't hasSideEffects sufficient for these?
//
// Returns true exactly for the S_SET_GPR_IDX_ON, S_SET_GPR_IDX_MODE and
// S_SET_GPR_IDX_OFF opcodes, false for every other instruction.
// NOTE(review): the signature line is missing from this listing -- confirm
// the declaration against the original source.
4480 switch (MI.getOpcode()) {
4481 case AMDGPU::S_SET_GPR_IDX_ON:
4482 case AMDGPU::S_SET_GPR_IDX_MODE:
4483 case AMDGPU::S_SET_GPR_IDX_OFF:
4484 return true;
4485 default:
4486 return false;
4487 }
4488 }
4489
// Reports whether MI must be treated as a scheduling boundary. Terminators,
// position markers, INLINEASM_BR, SCHED_BARRIER with a zero mask operand,
// instructions that modify EXEC, and the S_SETREG / S_SETPRIO opcodes listed
// below all return true.
// NOTE(review): the first line of the signature is missing from this listing
// -- confirm the declaration against the original source.
4491 const MachineBasicBlock *MBB,
4492 const MachineFunction &MF) const {
4493 // Skipping the check for SP writes in the base implementation. The reason it
4494 // was added was apparently due to compile time concerns.
4495 //
4496 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4497 // but is probably avoidable.
4498
4499 // Copied from base implementation.
4500 // Terminators and labels can't be scheduled around.
4501 if (MI.isTerminator() || MI.isPosition())
4502 return true;
4503
4504 // INLINEASM_BR can jump to another block
4505 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4506 return true;
4507
// Only a SCHED_BARRIER whose immediate operand is 0 is a boundary here.
4508 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4509 return true;
4510
4511 // Target-independent instructions do not have an implicit-use of EXEC, even
4512 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4513 // boundaries prevents incorrect movements of such instructions.
// NOTE(review): the final operand of this expression is missing from this
// listing (the line after the trailing `||` was dropped).
4514 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4515 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4516 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4517 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4518 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4520 }
4521
// Returns true when Opcode is DS_ORDERED_COUNT, DS_ADD_GS_REG_RTN,
// DS_SUB_GS_REG_RTN, or any opcode accepted by isGWS.
// NOTE(review): the signature line is missing from this listing -- confirm
// the declaration against the original source.
4523 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4524 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4525 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4526 }
4527
// Conservatively reports whether MI may access scratch memory. Only FLAT
// (other than FLAT global) and BUF instructions can; among those the answer
// comes from the instruction kind, the "amdgpu-no-flat-scratch-init" function
// attribute, and the address spaces of the memory operands.
// NOTE(review): the signature line is missing from this listing -- confirm
// the declaration against the original source.
4529 // Instructions that access scratch use FLAT encoding or BUF encodings.
4530 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4531 return false;
4532
4533 // SCRATCH instructions always access scratch.
4534 if (isFLATScratch(MI))
4535 return true;
4536
4537 // If FLAT_SCRATCH registers are not initialized, we can never access scratch
4538 // via the aperture.
4539 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4540 return false;
4541
4542 // If there are no memory operands then conservatively assume the flat
4543 // operation may access scratch.
4544 if (MI.memoperands_empty())
4545 return true;
4546
4547 // See if any memory operand specifies an address space that involves scratch.
4548 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4549 unsigned AS = Memop->getAddrSpace();
4550 if (AS == AMDGPUAS::FLAT_ADDRESS) {
// A flat access counts as scratch unless its NoAliasAddrSpace metadata
// contains the private address space.
4551 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4552 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4553 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4554 }
4555 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4556 });
4557 }
4558
// Conservatively reports whether the FLAT instruction MI may access VMEM.
// Returns false when MI does not use the VM counter or when every memory
// operand is in the LOCAL address space; returns true otherwise, including
// when MI has no memory operands at all.
// NOTE(review): the signature line is missing from this listing -- confirm
// the declaration against the original source.
4560 assert(isFLAT(MI));
4561
4562 // All flat instructions use the VMEM counter except prefetch.
4563 if (!usesVM_CNT(MI))
4564 return false;
4565
4566 // If there are no memory operands then conservatively assume the flat
4567 // operation may access VMEM.
4568 if (MI.memoperands_empty())
4569 return true;
4570
4571 // See if any memory operand specifies an address space that involves VMEM.
4572 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
4573 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4574 // (GDS) address space is not supported by flat operations. Therefore, simply
4575 // return true unless only the LDS address space is found.
4576 for (const MachineMemOperand *Memop : MI.memoperands()) {
4577 unsigned AS = Memop->getAddrSpace();
// NOTE(review): one line is missing from this listing at this point.
4579 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4580 return true;
4581 }
4582
4583 return false;
4584 }
4585
// Conservatively reports whether the FLAT instruction MI may access LDS.
// Returns false when MI does not use the LGKM counter or when tgsplit mode is
// enabled; returns true when MI has no memory operands; otherwise the memory
// operands' address spaces decide.
// NOTE(review): the signature line is missing from this listing -- confirm
// the declaration against the original source.
4587 assert(isFLAT(MI));
4588
4589 // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
4590 if (!usesLGKM_CNT(MI))
4591 return false;
4592
4593 // If in tgsplit mode then there can be no use of LDS.
4594 if (ST.isTgSplitEnabled())
4595 return false;
4596
4597 // If there are no memory operands then conservatively assume the flat
4598 // operation may access LDS.
4599 if (MI.memoperands_empty())
4600 return true;
4601
4602 // See if any memory operand specifies an address space that involves LDS.
4603 for (const MachineMemOperand *Memop : MI.memoperands()) {
4604 unsigned AS = Memop->getAddrSpace();
// NOTE(review): the condition that tests AS before this return is missing
// from this listing; as shown the return would be unconditional.
4606 return true;
4607 }
4608
4609 return false;
4610 }
4611
// Returns true when the instruction description of MI lists AMDGPU::MODE
// among its implicit defs.
// NOTE(review): the signature line is missing from this listing -- confirm
// the declaration against the original source.
4613 // Skip the full operand and register alias search modifiesRegister
4614 // does. There's only a handful of instructions that touch this, it's only an
4615 // implicit def, and doesn't alias any other registers.
4616 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4617 }
4618
// Reports whether MI is one of the instructions this function treats as
// having an undesirable effect when executed with an empty EXEC mask. The
// categories are checked in order below; anything not matched returns false.
// NOTE(review): the signature line is missing from this listing -- confirm
// the declaration against the original source.
4620 unsigned Opcode = MI.getOpcode();
4621
4622 if (MI.mayStore() && isSMRD(MI))
4623 return true; // scalar store or atomic
4624
4625 // This will terminate the function when other lanes may need to continue.
4626 if (MI.isReturn())
4627 return true;
4628
4629 // These instructions cause shader I/O that may cause hardware lockups
4630 // when executed with an empty EXEC mask.
4631 //
4632 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4633 // EXEC = 0, but checking for that case here seems not worth it
4634 // given the typical code patterns.
4635 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4636 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4637 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT ||
4638 Opcode == AMDGPU::S_SETHALT)
4639 return true;
4640
4641 if (MI.isCall() || MI.isInlineAsm())
4642 return true; // conservative assumption
4643
4644 // Assume that barrier interactions are only intended with active lanes.
4645 if (isBarrier(Opcode))
4646 return true;
4647
4648 // A mode change is a scalar operation that influences vector instructions.
// NOTE(review): the condition guarding the following return is missing from
// this listing (one line was dropped here).
4650 return true;
4651
4652 // These are like SALU instructions in terms of effects, so it's questionable
4653 // whether we should return true for those.
4654 //
4655 // However, executing them with EXEC = 0 causes them to operate on undefined
4656 // data, which we avoid by returning true here.
4657 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4658 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4659 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4660 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4661 return true;
4662
4663 return false;
4664 }
4665
// Conservatively reports whether MI may read EXEC. Meta instructions never
// do; copy-like instructions, calls and generic opcodes get special handling;
// for the remaining target opcodes the answer is true unless MI is a SALU
// instruction that does not read EXEC.
// NOTE(review): the first line of the signature is missing from this listing;
// the body uses a parameter named MRI -- confirm the declaration against the
// original source.
4667 const MachineInstr &MI) const {
4668 if (MI.isMetaInstruction())
4669 return false;
4670
4671 // This won't read exec if this is an SGPR->SGPR copy.
4672 if (MI.isCopyLike()) {
// Any copy whose destination (operand 0) is not an SGPR is assumed to read
// EXEC.
4673 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4674 return true;
4675
4676 // Make sure this isn't copying exec as a normal operand
4677 return MI.readsRegister(AMDGPU::EXEC, &RI);
4678 }
4679
4680 // Make a conservative assumption about the callee.
4681 if (MI.isCall())
4682 return true;
4683
4684 // Be conservative with any unhandled generic opcodes.
4685 if (!isTargetSpecificOpcode(MI.getOpcode()))
4686 return true;
4687
4688 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4689 }
4690
4691bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4692 switch (Imm.getBitWidth()) {
4693 case 1: // This likely will be a condition code mask.
4694 return true;
4695
4696 case 32:
4697 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4698 ST.hasInv2PiInlineImm());
4699 case 64:
4700 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4701 ST.hasInv2PiInlineImm());
4702 case 16:
4703 return ST.has16BitInsts() &&
4704 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4705 ST.hasInv2PiInlineImm());
4706 default:
4707 llvm_unreachable("invalid bitwidth");
4708 }
4709}
4710
// Floating-point flavour of isInlineConstant: reinterprets Imm as an integer
// bit pattern and dispatches on Imm's floating-point semantics. One group of
// semantics defers to the APInt overload; the other two require 16-bit
// instructions and use the BF16 / FP16 inline-literal checks.
// NOTE(review): the signature line and every `case` label of the switch are
// missing from this listing, so which semantics select which return cannot be
// read off here -- confirm against the original source.
4712 APInt IntImm = Imm.bitcastToAPInt();
4713 int64_t IntImmVal = IntImm.getSExtValue();
4714 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4715 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4716 default:
4717 llvm_unreachable("invalid fltSemantics");
4720 return isInlineConstant(IntImm);
4722 return ST.has16BitInsts() &&
4723 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4725 return ST.has16BitInsts() &&
4726 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4727 }
4728 }
4729
// Decides whether the raw 64-bit value Imm is an inline constant for an
// operand of the given OperandType. The operand type selects how Imm is
// interpreted (for example truncated to 32 or 16 bits) before the matching
// AMDGPU::isInlinable* helper is consulted; an unknown operand type is a
// programming error.
// NOTE(review): every `case` label of the switch is missing from this
// listing, so the mapping from operand types to the branches below cannot be
// read off here -- confirm against the original source.
4730 bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4731 // MachineOperand provides no way to tell the true operand size, since it only
4732 // records a 64-bit value. We need to know the size to determine if a 32-bit
4733 // floating point immediate bit pattern is legal for an integer immediate. It
4734 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4735 switch (OperandType) {
// 32-bit branch: only the low 32 bits of Imm are considered.
4745 int32_t Trunc = static_cast<int32_t>(Imm);
4746 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4747 }
4753 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4756 // We would expect inline immediates to not be concerned with an integer/fp
4757 // distinction. However, in the case of 16-bit integer operations, the
4758 // "floating point" values appear to not work. It seems to read the low
4759 // 16 bits of 32-bit immediates, which happens to always work for the integer
4760 // values.
4761 //
4762 // See llvm bugzilla 46302.
4763 //
4764 // TODO: Theoretically we could use op-sel to use the high bits of the
4765 // 32-bit FP values.
4774 return AMDGPU::isPKFMACF16InlineConstant(Imm, ST.isGFX11Plus());
4779 return false;
// FP16 branch: Imm must fit in 16 bits (signed or unsigned) to be considered.
4782 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4783 // A few special case instructions have 16-bit operands on subtargets
4784 // where 16-bit instructions are not legal.
4785 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4786 // constants in these cases
4787 int16_t Trunc = static_cast<int16_t>(Imm);
4788 return ST.has16BitInsts() &&
4789 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4790 }
4791
4792 return false;
4793 }
// BF16 branch: same 16-bit range requirement as the FP16 branch.
4796 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4797 int16_t Trunc = static_cast<int16_t>(Imm);
4798 return ST.has16BitInsts() &&
4799 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4800 }
4801 return false;
4802 }
4806 return false;
4808 return isLegalAV64PseudoImm(Imm);
4811 // Always embedded in the instruction for free.
4812 return true;
4822 // Just ignore anything else.
4823 return true;
4824 default:
4825 llvm_unreachable("invalid operand type");
4826 }
4827 }
4828
// Returns true when Op0 and Op1 have the same operand type and the same
// payload. Two payload comparisons are visible below, one on getReg() and one
// on getImm(); any other operand type is a programming error.
// NOTE(review): the two `case` labels of the switch are missing from this
// listing (presumably the register and immediate operand kinds) -- confirm
// against the original source.
4829 static bool compareMachineOp(const MachineOperand &Op0,
4830 const MachineOperand &Op1) {
// Operands of different kinds never compare equal.
4831 if (Op0.getType() != Op1.getType())
4832 return false;
4833
4834 switch (Op0.getType()) {
4836 return Op0.getReg() == Op1.getReg();
4838 return Op0.getImm() == Op1.getImm();
4839 default:
4840 llvm_unreachable("Didn't expect to be comparing these operand types");
4841 }
4842 }
4843
// Decides whether a literal (non-inline) constant may be placed in the
// operand described by OpInfo of an instruction described by InstDesc.
// NOTE(review): the first line of the signature is missing from this listing;
// the body uses a parameter named InstDesc -- confirm the declaration against
// the original source.
4845 const MCOperandInfo &OpInfo) const {
// Plain immediate operands always accept a literal.
4846 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4847 return true;
4848
4849 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4850 return false;
4851
// Outside VOP3, or for operands that are not SI source operands, the operand
// type check above is sufficient.
4852 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4853 return true;
4854
// VOP3 source operands additionally need subtarget support for literals.
4855 return ST.hasVOP3Literal();
4856 }
4857
4858bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4859 int64_t ImmVal) const {
4860 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4861 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4862 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4863 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4864 AMDGPU::OpName::src2))
4865 return false;
4866 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4867 }
4868
4869 return isLiteralOperandLegal(InstDesc, OpInfo);
4870}
4871
4872bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4873 const MachineOperand &MO) const {
4874 if (MO.isImm())
4875 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4876
4877 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4878 "unexpected imm-like operand kind");
4879 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4880 return isLiteralOperandLegal(InstDesc, OpInfo);
4881}
4882
// Returns true when both the low and the high 32-bit halves of Imm are
// inlinable 32-bit literals (honouring the subtarget's inv2pi inline support).
// NOTE(review): listing line 4883 (the signature) is absent from this extract;
// presumably this is the body of isLegalAV64PseudoImm(uint64_t Imm), which is
// called earlier in the file — confirm against upstream.
4884 // 2 32-bit inline constants packed into one.
4885 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4886 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4887}
4888
4889bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4890 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4891 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4892 return false;
4893
4894 int Op32 = AMDGPU::getVOPe32(Opcode);
4895 if (Op32 == -1)
4896 return false;
4897
4898 return pseudoToMCOpcode(Op32) != -1;
4899}
4900
4901bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4902 // The src0_modifier operand is present on all instructions
4903 // that have modifiers.
4904
4905 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4906}
4907
// Returns true when the operand named OpName exists on MI and holds a
// non-zero immediate; a missing operand counts as "not set".
// NOTE(review): listing line 4908 (the first line of the signature) is absent
// from this extract; callers invoke this as hasModifiersSet(MI, OpName).
4909 AMDGPU::OpName OpName) const {
4910 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4911 return Mods && Mods->getImm();
4912}
4913
// Returns true when hasModifiersSet() holds for at least one operand name in
// ModifierOpNames (a list that is not defined in this extract).
// NOTE(review): listing line 4914 (the signature) is absent from this extract.
4915 return any_of(ModifierOpNames,
4916 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4917}
4918
// Decides whether MI can be rewritten to its 32-bit encoding. The checks are:
// src2 is only tolerated for the opcodes listed in the switch, src1 must be an
// unmodified VGPR, src0 must carry no modifiers, a 32-bit encoding must exist,
// and none of omod/clamp/byte_sel/bound_ctrl/fi may be set.
// NOTE(review): listing line 4919 (the first line of the signature) is absent
// from this extract.
4920 const MachineRegisterInfo &MRI) const {
4921 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4922 // Can't shrink instruction with three operands.
4923 if (Src2) {
4924 switch (MI.getOpcode()) {
4925 default: return false;
4926
 // Carry-in add/sub: only src1 is checked here, then the answer is final
 // (the common src1/src0/output-modifier checks below are bypassed).
4927 case AMDGPU::V_ADDC_U32_e64:
4928 case AMDGPU::V_SUBB_U32_e64:
4929 case AMDGPU::V_SUBBREV_U32_e64: {
4930 const MachineOperand *Src1
4931 = getNamedOperand(MI, AMDGPU::OpName::src1);
4932 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4933 return false;
4934 // Additional verification is needed for sdst/src2.
4935 return true;
4936 }
 // MAC/FMAC: src2 must be a VGPR without modifiers; the common checks
 // after the switch still apply.
4937 case AMDGPU::V_MAC_F16_e64:
4938 case AMDGPU::V_MAC_F32_e64:
4939 case AMDGPU::V_MAC_LEGACY_F32_e64:
4940 case AMDGPU::V_FMAC_F16_e64:
4941 case AMDGPU::V_FMAC_F16_t16_e64:
4942 case AMDGPU::V_FMAC_F16_fake16_e64:
4943 case AMDGPU::V_FMAC_F32_e64:
4944 case AMDGPU::V_FMAC_F64_e64:
4945 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4946 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4947 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4948 return false;
4949 break;
4950
 // V_CNDMASK_B32: no src2 restriction is imposed here.
4951 case AMDGPU::V_CNDMASK_B32_e64:
4952 break;
4953 }
4954 }
4955
 // When present, src1 has to be a VGPR register with no modifiers.
4956 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4957 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4958 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4959 return false;
4960
4961 // We don't need to check src0, all input types are legal, so just make sure
4962 // src0 isn't using any modifiers.
4963 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4964 return false;
4965
4966 // Can it be shrunk to a valid 32 bit opcode?
4967 if (!hasVALU32BitEncoding(MI.getOpcode()))
4968 return false;
4969
4970 // Check output modifiers
4971 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4972 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4973 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4974 // TODO: Can we avoid checking bound_ctrl/fi here?
4975 // They are only used by permlane*_swap special case.
4976 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4977 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4978}
4979
4980// Set VCC operand with all flags from \p Orig, except for setting it as
4981// implicit.
// Concretely: the first implicit use of VCC or VCC_LO on MI receives the undef
// and kill flags of Orig, after which the function returns. If MI has no such
// operand nothing is changed.
// NOTE(review): listing line 4982 (the first line of the signature) is absent
// from this extract; the caller passes (MachineInstr &, const MachineOperand &).
4983 const MachineOperand &Orig) {
4984
4985 for (MachineOperand &Use : MI.implicit_operands()) {
4986 if (Use.isUse() &&
4987 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4988 Use.setIsUndef(Orig.isUndef());
4989 Use.setIsKill(Orig.isKill());
4990 return;
4991 }
4992 }
4993}
4994
// Builds the 32-bit form (opcode Op32) of MI immediately before MI in the same
// basic block, copying MI's debug location and MI flags, and returns the
// builder for the new instruction. MI itself is not erased here.
// NOTE(review): listing lines 4995 (first line of the signature) and 5017 (the
// condition guarding the first `continue` in the use loop) are absent from
// this extract.
4996 unsigned Op32) const {
4997 MachineBasicBlock *MBB = MI.getParent();
4998
4999 const MCInstrDesc &Op32Desc = get(Op32);
5000 MachineInstrBuilder Inst32 =
5001 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
5002 .setMIFlags(MI.getFlags());
5003
5004 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
5005 // For VOPC instructions, this is replaced by an implicit def of vcc.
5006
5007 // We assume the defs of the shrunk opcode are in the same order, and the
5008 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
5009 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
5010 Inst32.add(MI.getOperand(I));
5011
5012 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
5013
 // Idx tracks the descriptor index of the explicit use currently visited.
5014 int Idx = MI.getNumExplicitDefs();
5015 for (const MachineOperand &Use : MI.explicit_uses()) {
5016 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
 // NOTE(review): the test on OpTy that selects which uses are dropped
 // (listing line 5017) is missing here.
5018 continue;
5019
5020 if (&Use == Src2) {
5021 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
5022 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
5023 // replaced with an implicit read of vcc or vcc_lo. The implicit read
5024 // of vcc was already added during the initial BuildMI, but we
5025 // 1) may need to change vcc to vcc_lo to preserve the original register
5026 // 2) have to preserve the original flags.
5027 copyFlagsToImplicitVCC(*Inst32, *Src2);
5028 continue;
5029 }
5030 }
5031
5032 Inst32.add(Use);
5033 }
5034
5035 // FIXME: Losing implicit operands
5036 fixImplicitOperands(*Inst32);
5037 return Inst32;
5038}
5039
// Returns true when the register held by RegOp counts against the constant
// bus. SGPR_NULL / SGPR_NULL64 never count; implicit operands count only for
// VCC, VCC_LO and M0; explicit operands count when the register belongs to
// SReg_32 or SReg_64.
// NOTE(review): listing line 5040 (the signature) is absent from this extract;
// the caller below passes a single MachineOperand (RegOp).
5041 // Null is free
5042 Register Reg = RegOp.getReg();
5043 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
5044 return false;
5045
5046 // SGPRs use the constant bus
5047
5048 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
5049 // physical register operands should also count, except for exec.
5050 if (RegOp.isImplicit())
5051 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
5052
5053 // SGPRs use the constant bus
5054 return AMDGPU::SReg_32RegClass.contains(Reg) ||
5055 AMDGPU::SReg_64RegClass.contains(Reg);
5056}
5057
// Register-operand variant of the constant-bus query: a virtual register
// counts when its register class is an SGPR class; a physical register is
// delegated to physRegUsesConstantBus().
// NOTE(review): listing line 5058 (the first line of the signature) is absent
// from this extract.
5059 const MachineRegisterInfo &MRI) const {
5060 Register Reg = RegOp.getReg();
5061 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5062 : physRegUsesConstantBus(RegOp);
5063}
5064
// Returns true when operand MO (described by OpInfo) occupies the constant
// bus: non-register operands do so unless they are inline constants, and
// virtual registers do so when their class is an SGPR class.
// NOTE(review): listing lines 5065 (first line of the signature) and 5074 (the
// false arm of the conditional expression below, covering non-virtual
// registers) are absent from this extract.
5066 const MachineOperand &MO,
5067 const MCOperandInfo &OpInfo) const {
5068 // Literal constants use the constant bus.
5069 if (!MO.isReg())
5070 return !isInlineConstant(MO, OpInfo);
5071
5072 Register Reg = MO.getReg();
5073 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5075}
5076
// Scans MI's implicit operands and returns the first one that is read (not
// defined) and is one of VCC, VCC_LO, VCC_HI, M0 or FLAT_SCR. Returns an empty
// Register when there is none.
// NOTE(review): listing line 5077 (the signature) is absent from this extract;
// the caller invokes this as findImplicitSGPRRead(MI).
5078 for (const MachineOperand &MO : MI.implicit_operands()) {
5079 // We only care about reads.
5080 if (MO.isDef())
5081 continue;
5082
5083 switch (MO.getReg()) {
5084 case AMDGPU::VCC:
5085 case AMDGPU::VCC_LO:
5086 case AMDGPU::VCC_HI:
5087 case AMDGPU::M0:
5088 case AMDGPU::FLAT_SCR:
5089 return MO.getReg();
5090
5091 default:
5092 break;
5093 }
5094 }
5095
5096 return Register();
5097}
5098
// Tells the verifier whether MI is expected to carry an implicit use of EXEC.
// VALU instructions are, except for the four lane read/write and
// SGPR-to-VGPR spill/restore opcodes listed in the switch. For everything else
// the second condition decides.
// NOTE(review): listing lines 5114-5115 (the remaining terms of the second
// condition) are absent from this extract.
5099static bool shouldReadExec(const MachineInstr &MI) {
5100 if (SIInstrInfo::isVALU(MI)) {
5101 switch (MI.getOpcode()) {
5102 case AMDGPU::V_READLANE_B32:
5103 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5104 case AMDGPU::V_WRITELANE_B32:
5105 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5106 return false;
5107 }
5108
5109 return true;
5110 }
5111
 // Pre-ISel and generic opcodes (among the cases still visible here) are not
 // expected to read EXEC.
5112 if (MI.isPreISelOpcode() ||
5113 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
5116 return false;
5117
5118 return true;
5119}
5120
5121static bool isRegOrFI(const MachineOperand &MO) {
5122 return MO.isReg() || MO.isFI();
5123}
5124
5125static bool isSubRegOf(const SIRegisterInfo &TRI,
5126 const MachineOperand &SuperVec,
5127 const MachineOperand &SubReg) {
5128 if (SubReg.getReg().isPhysical())
5129 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5130
5131 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5132 SubReg.getReg() == SuperVec.getReg();
5133}
5134
5135// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5136bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5137 const MachineRegisterInfo &MRI,
5138 StringRef &ErrInfo) const {
5139 Register DstReg = MI.getOperand(0).getReg();
5140 Register SrcReg = MI.getOperand(1).getReg();
5141 // This is a check for copy from vector register to SGPR
5142 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5143 ErrInfo = "illegal copy from vector register to SGPR";
5144 return false;
5145 }
5146 return true;
5147}
5148
5150 StringRef &ErrInfo) const {
5151 uint32_t Opcode = MI.getOpcode();
5152 const MachineFunction *MF = MI.getMF();
5153 const MachineRegisterInfo &MRI = MF->getRegInfo();
5154
5155 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5156 // Find a better property to recognize the point where instruction selection
5157 // is just done.
5158 // We can only enforce this check after SIFixSGPRCopies pass so that the
5159 // illegal copies are legalized and thereafter we don't expect a pass
5160 // inserting similar copies.
5161 if (!MRI.isSSA() && MI.isCopy())
5162 return verifyCopy(MI, MRI, ErrInfo);
5163
5164 if (SIInstrInfo::isGenericOpcode(Opcode))
5165 return true;
5166
5167 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5168 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5169 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5170 int Src3Idx = -1;
5171 if (Src0Idx == -1) {
5172 // VOPD V_DUAL_* instructions use different operand names.
5173 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5174 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5175 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5176 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5177 }
5178
5179 // Make sure the number of operands is correct.
5180 const MCInstrDesc &Desc = get(Opcode);
5181 if (!Desc.isVariadic() &&
5182 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5183 ErrInfo = "Instruction has wrong number of operands.";
5184 return false;
5185 }
5186
5187 if (MI.isInlineAsm()) {
5188 // Verify register classes for inlineasm constraints.
5189 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5190 I != E; ++I) {
5191 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5192 if (!RC)
5193 continue;
5194
5195 const MachineOperand &Op = MI.getOperand(I);
5196 if (!Op.isReg())
5197 continue;
5198
5199 Register Reg = Op.getReg();
5200 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5201 ErrInfo = "inlineasm operand has incorrect register class.";
5202 return false;
5203 }
5204 }
5205
5206 return true;
5207 }
5208
5209 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5210 ErrInfo = "missing memory operand from image instruction.";
5211 return false;
5212 }
5213
5214 // Make sure the register classes are correct.
5215 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5216 const MachineOperand &MO = MI.getOperand(i);
5217 if (MO.isFPImm()) {
5218 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5219 "all fp values to integers.";
5220 return false;
5221 }
5222
5223 const MCOperandInfo &OpInfo = Desc.operands()[i];
5224 int16_t RegClass = getOpRegClassID(OpInfo);
5225
5226 switch (OpInfo.OperandType) {
5228 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5229 ErrInfo = "Illegal immediate value for operand.";
5230 return false;
5231 }
5232 break;
5242 break;
5244 break;
5245 break;
5259 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5260 ErrInfo = "Illegal immediate value for operand.";
5261 return false;
5262 }
5263 break;
5264 }
5269 if (ST.has64BitLiterals() && Desc.getSize() != 4 && MO.isImm() &&
5270 !isInlineConstant(MI, i) &&
5272 OpInfo.OperandType ==
5274 ErrInfo = "illegal 64-bit immediate value for operand.";
5275 return false;
5276 }
5277 break;
5280 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5281 ErrInfo = "Expected inline constant for operand.";
5282 return false;
5283 }
5284 break;
5287 break;
5292 // Check if this operand is an immediate.
5293 // FrameIndex operands will be replaced by immediates, so they are
5294 // allowed.
5295 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5296 ErrInfo = "Expected immediate, but got non-immediate";
5297 return false;
5298 }
5299 break;
5303 break;
5304 default:
5305 if (OpInfo.isGenericType())
5306 continue;
5307 break;
5308 }
5309
5310 if (!MO.isReg())
5311 continue;
5312 Register Reg = MO.getReg();
5313 if (!Reg)
5314 continue;
5315
5316 // FIXME: Ideally we would have separate instruction definitions with the
5317 // aligned register constraint.
5318 // FIXME: We do not verify inline asm operands, but custom inline asm
5319 // verification is broken anyway
5320 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
5321 Opcode != AMDGPU::V_MOV_B64_PSEUDO && !isSpill(MI)) {
5322 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5323 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5324 if (const TargetRegisterClass *SubRC =
5325 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5326 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5327 if (RC)
5328 RC = SubRC;
5329 }
5330 }
5331
5332 // Check that this is the aligned version of the class.
5333 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5334 ErrInfo = "Subtarget requires even aligned vector registers";
5335 return false;
5336 }
5337 }
5338
5339 if (RegClass != -1) {
5340 if (Reg.isVirtual())
5341 continue;
5342
5343 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5344 if (!RC->contains(Reg)) {
5345 ErrInfo = "Operand has incorrect register class.";
5346 return false;
5347 }
5348 }
5349 }
5350
5351 // Verify SDWA
5352 if (isSDWA(MI)) {
5353 if (!ST.hasSDWA()) {
5354 ErrInfo = "SDWA is not supported on this target";
5355 return false;
5356 }
5357
5358 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5359 AMDGPU::OpName::dst_sel}) {
5360 const MachineOperand *MO = getNamedOperand(MI, Op);
5361 if (!MO)
5362 continue;
5363 int64_t Imm = MO->getImm();
5364 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5365 ErrInfo = "Invalid SDWA selection";
5366 return false;
5367 }
5368 }
5369
5370 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5371
5372 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5373 if (OpIdx == -1)
5374 continue;
5375 const MachineOperand &MO = MI.getOperand(OpIdx);
5376
5377 if (!ST.hasSDWAScalar()) {
5378 // Only VGPRS on VI
5379 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5380 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5381 return false;
5382 }
5383 } else {
5384 // No immediates on GFX9
5385 if (!MO.isReg()) {
5386 ErrInfo =
5387 "Only reg allowed as operands in SDWA instructions on GFX9+";
5388 return false;
5389 }
5390 }
5391 }
5392
5393 if (!ST.hasSDWAOmod()) {
5394 // No omod allowed on VI
5395 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5396 if (OMod != nullptr &&
5397 (!OMod->isImm() || OMod->getImm() != 0)) {
5398 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5399 return false;
5400 }
5401 }
5402
5403 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5404 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5405 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5406 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5407 const MachineOperand *Src0ModsMO =
5408 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5409 unsigned Mods = Src0ModsMO->getImm();
5410 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5411 Mods & SISrcMods::SEXT) {
5412 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5413 return false;
5414 }
5415 }
5416
5417 uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5418 if (isVOPC(BasicOpcode)) {
5419 if (!ST.hasSDWASdst() && DstIdx != -1) {
5420 // Only vcc allowed as dst on VI for VOPC
5421 const MachineOperand &Dst = MI.getOperand(DstIdx);
5422 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5423 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5424 return false;
5425 }
5426 } else if (!ST.hasSDWAOutModsVOPC()) {
5427 // No clamp allowed on GFX9 for VOPC
5428 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5429 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5430 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5431 return false;
5432 }
5433
5434 // No omod allowed on GFX9 for VOPC
5435 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5436 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5437 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5438 return false;
5439 }
5440 }
5441 }
5442
5443 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5444 if (DstUnused && DstUnused->isImm() &&
5445 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5446 const MachineOperand &Dst = MI.getOperand(DstIdx);
5447 if (!Dst.isReg() || !Dst.isTied()) {
5448 ErrInfo = "Dst register should have tied register";
5449 return false;
5450 }
5451
5452 const MachineOperand &TiedMO =
5453 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5454 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5455 ErrInfo =
5456 "Dst register should be tied to implicit use of preserved register";
5457 return false;
5458 }
5459 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5460 ErrInfo = "Dst register should use same physical register as preserved";
5461 return false;
5462 }
5463 }
5464 }
5465
5466 if (isDPP(MI) && !ST.hasDPPSrc1SGPR() && Src1Idx != -1) {
5467 const MachineOperand &Src1MO = MI.getOperand(Src1Idx);
5468 if (Src1MO.isReg() && RI.isSGPRReg(MRI, Src1MO.getReg())) {
5469 ErrInfo = "DPP src1 cannot be SGPR on this subtarget";
5470 return false;
5471 }
5472 }
5473
5474 // Verify MIMG / VIMAGE / VSAMPLE
5475 if (isImage(Opcode) && !MI.mayStore()) {
5476 // Ensure that the return type used is large enough for all the options
5477 // being used TFE/LWE require an extra result register.
5478 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5479 if (DMask) {
5480 uint64_t DMaskImm = DMask->getImm();
5481 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5482 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5483 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5484 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5485
5486 // Adjust for packed 16 bit values
5487 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5488 RegCount = divideCeil(RegCount, 2);
5489
5490 // Adjust if using LWE or TFE
5491 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5492 RegCount += 1;
5493
5494 const uint32_t DstIdx =
5495 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5496 const MachineOperand &Dst = MI.getOperand(DstIdx);
5497 if (Dst.isReg()) {
5498 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5499 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5500 if (RegCount > DstSize) {
5501 ErrInfo = "Image instruction returns too many registers for dst "
5502 "register class";
5503 return false;
5504 }
5505 }
5506 }
5507 }
5508
5509 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5510 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5511 unsigned ConstantBusCount = 0;
5512 bool UsesLiteral = false;
5513 const MachineOperand *LiteralVal = nullptr;
5514
5515 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5516 if (ImmIdx != -1) {
5517 ++ConstantBusCount;
5518 UsesLiteral = true;
5519 LiteralVal = &MI.getOperand(ImmIdx);
5520 }
5521
5522 SmallVector<Register, 2> SGPRsUsed;
5523 Register SGPRUsed;
5524
5525 // Only look at the true operands. Only a real operand can use the constant
5526 // bus, and we don't want to check pseudo-operands like the source modifier
5527 // flags.
5528 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5529 if (OpIdx == -1)
5530 continue;
5531 const MachineOperand &MO = MI.getOperand(OpIdx);
5532 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5533 if (MO.isReg()) {
5534 SGPRUsed = MO.getReg();
5535 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5536 ++ConstantBusCount;
5537 SGPRsUsed.push_back(SGPRUsed);
5538 }
5539 } else if (!MO.isFI()) { // Treat FI like a register.
5540 if (!UsesLiteral) {
5541 ++ConstantBusCount;
5542 UsesLiteral = true;
5543 LiteralVal = &MO;
5544 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5545 assert(isVOP2(MI) || isVOP3(MI));
5546 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5547 return false;
5548 }
5549 }
5550 }
5551 }
5552
5553 SGPRUsed = findImplicitSGPRRead(MI);
5554 if (SGPRUsed) {
5555 // Implicit uses may safely overlap true operands
5556 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5557 return !RI.regsOverlap(SGPRUsed, SGPR);
5558 })) {
5559 ++ConstantBusCount;
5560 SGPRsUsed.push_back(SGPRUsed);
5561 }
5562 }
5563
5564 // v_writelane_b32 is an exception from constant bus restriction:
5565 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5566 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5567 Opcode != AMDGPU::V_WRITELANE_B32) {
5568 ErrInfo = "VOP* instruction violates constant bus restriction";
5569 return false;
5570 }
5571
5572 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5573 ErrInfo = "VOP3 instruction uses literal";
5574 return false;
5575 }
5576 }
5577
5578 // Special case for writelane - this can break the multiple constant bus rule,
5579 // but still can't use more than one SGPR register
5580 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5581 unsigned SGPRCount = 0;
5582 Register SGPRUsed;
5583
5584 for (int OpIdx : {Src0Idx, Src1Idx}) {
5585 if (OpIdx == -1)
5586 break;
5587
5588 const MachineOperand &MO = MI.getOperand(OpIdx);
5589
5590 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5591 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5592 if (MO.getReg() != SGPRUsed)
5593 ++SGPRCount;
5594 SGPRUsed = MO.getReg();
5595 }
5596 }
5597 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5598 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5599 return false;
5600 }
5601 }
5602 }
5603
5604 // Verify misc. restrictions on specific instructions.
5605 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5606 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5607 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5608 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5609 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5610 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5611 if (!compareMachineOp(Src0, Src1) &&
5612 !compareMachineOp(Src0, Src2)) {
5613 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5614 return false;
5615 }
5616 }
5617 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5618 SISrcMods::ABS) ||
5619 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5620 SISrcMods::ABS) ||
5621 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5622 SISrcMods::ABS)) {
5623 ErrInfo = "ABS not allowed in VOP3B instructions";
5624 return false;
5625 }
5626 }
5627
5628 if (isSOP2(MI) || isSOPC(MI)) {
5629 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5630 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5631
5632 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5633 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5634 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5635 !Src0.isIdenticalTo(Src1)) {
5636 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5637 return false;
5638 }
5639 }
5640
5641 if (isSOPK(MI)) {
5642 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5643 if (Desc.isBranch()) {
5644 if (!Op->isMBB()) {
5645 ErrInfo = "invalid branch target for SOPK instruction";
5646 return false;
5647 }
5648 } else {
5649 uint64_t Imm = Op->getImm();
5650 if (sopkIsZext(Opcode)) {
5651 if (!isUInt<16>(Imm)) {
5652 ErrInfo = "invalid immediate for SOPK instruction";
5653 return false;
5654 }
5655 } else {
5656 if (!isInt<16>(Imm)) {
5657 ErrInfo = "invalid immediate for SOPK instruction";
5658 return false;
5659 }
5660 }
5661 }
5662 }
5663
5664 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5665 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5666 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5667 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5668 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5669 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5670
5671 const unsigned StaticNumOps =
5672 Desc.getNumOperands() + Desc.implicit_uses().size();
5673 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5674
5675 // Require additional implicit operands. This allows a fixup done by the
5676 // post RA scheduler where the main implicit operand is killed and
5677 // implicit-defs are added for sub-registers that remain live after this
5678 // instruction.
5679 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5680 ErrInfo = "missing implicit register operands";
5681 return false;
5682 }
5683
5684 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5685 if (IsDst) {
5686 if (!Dst->isUse()) {
5687 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5688 return false;
5689 }
5690
5691 unsigned UseOpIdx;
5692 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5693 UseOpIdx != StaticNumOps + 1) {
5694 ErrInfo = "movrel implicit operands should be tied";
5695 return false;
5696 }
5697 }
5698
5699 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5700 const MachineOperand &ImpUse
5701 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5702 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5703 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5704 ErrInfo = "src0 should be subreg of implicit vector use";
5705 return false;
5706 }
5707 }
5708
5709 // Make sure we aren't losing exec uses in the td files. This mostly requires
5710 // being careful when using let Uses to try to add other use registers.
5711 if (shouldReadExec(MI)) {
5712 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5713 ErrInfo = "VALU instruction does not implicitly read exec mask";
5714 return false;
5715 }
5716 }
5717
5718 if (isSMRD(MI)) {
5719 if (MI.mayStore() &&
5720 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5721 // The register offset form of scalar stores may only use m0 as the
5722 // soffset register.
5723 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5724 if (Soff && Soff->getReg() != AMDGPU::M0) {
5725 ErrInfo = "scalar stores must use m0 as offset register";
5726 return false;
5727 }
5728 }
5729 }
5730
5731 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5732 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5733 if (Offset->getImm() != 0) {
5734 ErrInfo = "subtarget does not support offsets in flat instructions";
5735 return false;
5736 }
5737 }
5738
5739 if (isDS(MI) && !ST.hasGDS()) {
5740 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5741 if (GDSOp && GDSOp->getImm() != 0) {
5742 ErrInfo = "GDS is not supported on this subtarget";
5743 return false;
5744 }
5745 }
5746
5747 if (isImage(MI)) {
5748 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5749 if (DimOp) {
5750 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5751 AMDGPU::OpName::vaddr0);
5752 AMDGPU::OpName RSrcOpName =
5753 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5754 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5755 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5756 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5757 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5758 const AMDGPU::MIMGDimInfo *Dim =
5760
5761 if (!Dim) {
5762 ErrInfo = "dim is out of range";
5763 return false;
5764 }
5765
5766 bool IsA16 = false;
5767 if (ST.hasR128A16()) {
5768 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5769 IsA16 = R128A16->getImm() != 0;
5770 } else if (ST.hasA16()) {
5771 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5772 IsA16 = A16->getImm() != 0;
5773 }
5774
5775 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5776
5777 unsigned AddrWords =
5778 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5779
5780 unsigned VAddrWords;
5781 if (IsNSA) {
5782 VAddrWords = RsrcIdx - VAddr0Idx;
5783 if (ST.hasPartialNSAEncoding() &&
5784 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5785 unsigned LastVAddrIdx = RsrcIdx - 1;
5786 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5787 }
5788 } else {
5789 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5790 if (AddrWords > 12)
5791 AddrWords = 16;
5792 }
5793
5794 if (VAddrWords != AddrWords) {
5795 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5796 << " but got " << VAddrWords << "\n");
5797 ErrInfo = "bad vaddr size";
5798 return false;
5799 }
5800 }
5801 }
5802
5803 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5804 if (DppCt) {
5805 using namespace AMDGPU::DPP;
5806
5807 unsigned DC = DppCt->getImm();
5808 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5809 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5810 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5811 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5812 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5813 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5814 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5815 ErrInfo = "Invalid dpp_ctrl value";
5816 return false;
5817 }
5818 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5819 !ST.hasDPPWavefrontShifts()) {
5820 ErrInfo = "Invalid dpp_ctrl value: "
5821 "wavefront shifts are not supported on GFX10+";
5822 return false;
5823 }
5824 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5825 !ST.hasDPPBroadcasts()) {
5826 ErrInfo = "Invalid dpp_ctrl value: "
5827 "broadcasts are not supported on GFX10+";
5828 return false;
5829 }
5830 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5831 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5832 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5833 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5834 !ST.hasGFX90AInsts()) {
5835 ErrInfo = "Invalid dpp_ctrl value: "
5836 "row_newbroadcast/row_share is not supported before "
5837 "GFX90A/GFX10";
5838 return false;
5839 }
5840 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5841 ErrInfo = "Invalid dpp_ctrl value: "
5842 "row_share and row_xmask are not supported before GFX10";
5843 return false;
5844 }
5845 }
5846
5847 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5849 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5850 ErrInfo = "Invalid dpp_ctrl value: "
5851 "DP ALU dpp only support row_newbcast";
5852 return false;
5853 }
5854 }
5855
5856 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5857 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5858 AMDGPU::OpName DataName =
5859 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5860 const MachineOperand *Data = getNamedOperand(MI, DataName);
5861 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5862 if (Data && !Data->isReg())
5863 Data = nullptr;
5864
5865 if (ST.hasGFX90AInsts()) {
5866 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5867 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5868 ErrInfo = "Invalid register class: "
5869 "vdata and vdst should be both VGPR or AGPR";
5870 return false;
5871 }
5872 if (Data && Data2 &&
5873 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5874 ErrInfo = "Invalid register class: "
5875 "both data operands should be VGPR or AGPR";
5876 return false;
5877 }
5878 } else {
5879 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5880 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5881 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5882 ErrInfo = "Invalid register class: "
5883 "agpr loads and stores not supported on this GPU";
5884 return false;
5885 }
5886 }
5887 }
5888
5889 if (ST.needsAlignedVGPRs()) {
5890 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5892 if (!Op)
5893 return true;
5894 Register Reg = Op->getReg();
5895 if (Reg.isPhysical())
5896 return !(RI.getHWRegIndex(Reg) & 1);
5897 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5898 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5899 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5900 };
5901
5902 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5903 Opcode == AMDGPU::DS_GWS_BARRIER) {
5904
5905 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5906 ErrInfo = "Subtarget requires even aligned vector registers "
5907 "for DS_GWS instructions";
5908 return false;
5909 }
5910 }
5911
5912 if (isMIMG(MI)) {
5913 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5914 ErrInfo = "Subtarget requires even aligned vector registers "
5915 "for vaddr operand of image instructions";
5916 return false;
5917 }
5918 }
5919 }
5920
5921 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5922 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5923 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5924 ErrInfo = "Invalid register class: "
5925 "v_accvgpr_write with an SGPR is not supported on this GPU";
5926 return false;
5927 }
5928 }
5929
5930 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5931 const MachineOperand &SrcOp = MI.getOperand(1);
5932 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5933 ErrInfo = "pseudo expects only physical SGPRs";
5934 return false;
5935 }
5936 }
5937
5938 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5939 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5940 if (!ST.hasScaleOffset()) {
5941 ErrInfo = "Subtarget does not support offset scaling";
5942 return false;
5943 }
5944 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5945 ErrInfo = "Instruction does not support offset scaling";
5946 return false;
5947 }
5948 }
5949 }
5950
5951 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5952 // information.
5953 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5954 for (unsigned I = 0; I < 3; ++I) {
5956 return false;
5957 }
5958 }
5959
5960 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5961 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
5962 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
5963 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
5964 &AMDGPU::SReg_64RegClass) ||
5965 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5966 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5967 return false;
5968 }
5969 }
5970
5971 return true;
5972}
5973
// Instruction-level variant of the scalar-to-vector opcode lookup.
// S_MOV_B32 is special-cased: it becomes a plain COPY when its source
// (operand 1) is a register or its destination (operand 0) is an AGPR, and
// V_MOV_B32_e32 otherwise. Every other opcode is forwarded to the
// opcode-only getVALUOp overload.
5975 if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
5976 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
 // `||` binds tighter than `?:`, so the whole disjunction selects COPY.
5977 return MI.getOperand(1).isReg() || RI.isAGPR(MRI, MI.getOperand(0).getReg())
5978 ? AMDGPU::COPY
5979 : AMDGPU::V_MOV_B32_e32;
5980 }
5981 return getVALUOp(MI.getOpcode());
5982}
5983
5984// It is more readable to list mapped opcodes on the same line.
5985// clang-format off
5986
5987unsigned SIInstrInfo::getVALUOp(unsigned Opc) const {
5988 switch (Opc) {
5989 default: return AMDGPU::INSTRUCTION_LIST_END;
5990 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5991 case AMDGPU::COPY: return AMDGPU::COPY;
5992 case AMDGPU::PHI: return AMDGPU::PHI;
5993 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5994 case AMDGPU::WQM: return AMDGPU::WQM;
5995 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5996 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5997 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5998 case AMDGPU::S_ADD_I32:
5999 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
6000 case AMDGPU::S_ADDC_U32:
6001 return AMDGPU::V_ADDC_U32_e32;
6002 case AMDGPU::S_SUB_I32:
6003 return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
6004 // FIXME: These are not consistently handled, and selected when the carry is
6005 // used.
6006 case AMDGPU::S_ADD_U32:
6007 return AMDGPU::V_ADD_CO_U32_e32;
6008 case AMDGPU::S_SUB_U32:
6009 return AMDGPU::V_SUB_CO_U32_e32;
6010 case AMDGPU::S_ADD_U64_PSEUDO:
6011 return AMDGPU::V_ADD_U64_PSEUDO;
6012 case AMDGPU::S_SUB_U64_PSEUDO:
6013 return AMDGPU::V_SUB_U64_PSEUDO;
6014 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
6015 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
6016 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
6017 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
6018 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
6019 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
6020 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
6021 case AMDGPU::S_XNOR_B32:
6022 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
6023 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
6024 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
6025 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
6026 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
6027 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
6028 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
6029 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
6030 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
6031 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
6032 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
6033 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
6034 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
6035 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
6036 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
6037 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
6038 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
6039 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
6040 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
6041 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
6042 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
6043 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
6044 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
6045 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
6046 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
6047 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
6048 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
6049 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
6050 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
6051 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
6052 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
6053 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
6054 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
6055 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
6056 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
6057 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
6058 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
6059 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
6060 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
6061 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
6062 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
6063 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
6064 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
6065 case AMDGPU::S_CVT_F32_F16:
6066 case AMDGPU::S_CVT_HI_F32_F16:
6067 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
6068 : AMDGPU::V_CVT_F32_F16_fake16_e64;
6069 case AMDGPU::S_CVT_F16_F32:
6070 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
6071 : AMDGPU::V_CVT_F16_F32_fake16_e64;
6072 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
6073 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
6074 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
6075 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
6076 case AMDGPU::S_CEIL_F16:
6077 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
6078 : AMDGPU::V_CEIL_F16_fake16_e64;
6079 case AMDGPU::S_FLOOR_F16:
6080 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6081 : AMDGPU::V_FLOOR_F16_fake16_e64;
6082 case AMDGPU::S_TRUNC_F16:
6083 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6084 : AMDGPU::V_TRUNC_F16_fake16_e64;
6085 case AMDGPU::S_RNDNE_F16:
6086 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6087 : AMDGPU::V_RNDNE_F16_fake16_e64;
6088 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6089 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6090 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6091 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6092 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6093 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6094 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6095 case AMDGPU::S_ADD_F16:
6096 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6097 : AMDGPU::V_ADD_F16_fake16_e64;
6098 case AMDGPU::S_SUB_F16:
6099 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6100 : AMDGPU::V_SUB_F16_fake16_e64;
6101 case AMDGPU::S_MIN_F16:
6102 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6103 : AMDGPU::V_MIN_F16_fake16_e64;
6104 case AMDGPU::S_MAX_F16:
6105 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6106 : AMDGPU::V_MAX_F16_fake16_e64;
6107 case AMDGPU::S_MINIMUM_F16:
6108 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6109 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6110 case AMDGPU::S_MAXIMUM_F16:
6111 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6112 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6113 case AMDGPU::S_MUL_F16:
6114 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6115 : AMDGPU::V_MUL_F16_fake16_e64;
6116 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6117 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6118 case AMDGPU::S_FMAC_F16:
6119 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6120 : AMDGPU::V_FMAC_F16_fake16_e64;
6121 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6122 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6123 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6124 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6125 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6126 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6127 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6128 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6129 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6130 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6131 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6132 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6133 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6134 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6135 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6136 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6137 case AMDGPU::S_CMP_LT_F16:
6138 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6139 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6140 case AMDGPU::S_CMP_EQ_F16:
6141 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6142 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6143 case AMDGPU::S_CMP_LE_F16:
6144 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6145 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6146 case AMDGPU::S_CMP_GT_F16:
6147 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6148 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6149 case AMDGPU::S_CMP_LG_F16:
6150 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6151 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6152 case AMDGPU::S_CMP_GE_F16:
6153 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6154 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6155 case AMDGPU::S_CMP_O_F16:
6156 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6157 : AMDGPU::V_CMP_O_F16_fake16_e64;
6158 case AMDGPU::S_CMP_U_F16:
6159 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6160 : AMDGPU::V_CMP_U_F16_fake16_e64;
6161 case AMDGPU::S_CMP_NGE_F16:
6162 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6163 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6164 case AMDGPU::S_CMP_NLG_F16:
6165 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6166 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6167 case AMDGPU::S_CMP_NGT_F16:
6168 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6169 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6170 case AMDGPU::S_CMP_NLE_F16:
6171 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6172 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6173 case AMDGPU::S_CMP_NEQ_F16:
6174 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6175 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6176 case AMDGPU::S_CMP_NLT_F16:
6177 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6178 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6179 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6180 case AMDGPU::V_S_EXP_F16_e64:
6181 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6182 : AMDGPU::V_EXP_F16_fake16_e64;
6183 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6184 case AMDGPU::V_S_LOG_F16_e64:
6185 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6186 : AMDGPU::V_LOG_F16_fake16_e64;
6187 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6188 case AMDGPU::V_S_RCP_F16_e64:
6189 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6190 : AMDGPU::V_RCP_F16_fake16_e64;
6191 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6192 case AMDGPU::V_S_RSQ_F16_e64:
6193 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6194 : AMDGPU::V_RSQ_F16_fake16_e64;
6195 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6196 case AMDGPU::V_S_SQRT_F16_e64:
6197 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6198 : AMDGPU::V_SQRT_F16_fake16_e64;
6199 }
6201 "Unexpected scalar opcode without corresponding vector one!");
6202}
6203
6204// clang-format on
6205
6209 const DebugLoc &DL, Register Reg,
6210 bool IsSCCLive,
6211 SlotIndexes *Indexes) const {
 // Saves the exec mask in Reg and then turns on all bits of LMC.ExecReg,
 // inserting the new instruction(s) before MBBI. Which sequence is emitted
 // depends on IsSCCLive. Every instruction created here is registered with
 // Indexes when Indexes is non-null.
6212 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6213 const SIInstrInfo *TII = ST.getInstrInfo();
6215 if (IsSCCLive) {
6216 // Insert two move instructions, one to save the original value of EXEC and
6217 // the other to turn on all bits in EXEC. This is required as we can't use
6218 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
6219 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6221 auto FlipExecMI =
6222 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6223 if (Indexes) {
6224 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6225 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6226 }
6227 } else {
 // SCC is not live, so a single LMC.OrSaveExecOpc with an all-ones immediate
 // is used instead of the two moves.
6228 auto SaveExec =
6229 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6230 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6231 if (Indexes)
6232 Indexes->insertMachineInstrInMaps(*SaveExec);
6233 }
6234}
6235
6238 const DebugLoc &DL, Register Reg,
6239 SlotIndexes *Indexes) const {
 // Emits one LMC.MovOpc before MBBI that writes Reg into LMC.ExecReg. Reg is
 // marked killed by that move. The new instruction is registered with
 // Indexes when Indexes is non-null.
6241 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6242 .addReg(Reg, RegState::Kill);
6243 if (Indexes)
6244 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6245}
6246
6250 "Not a whole wave func");
 // Only the first basic block of MF is scanned. The first instruction whose
 // opcode is either SI_WHOLE_WAVE_FUNC_SETUP or the generic
 // G_AMDGPU_WHOLE_WAVE_FUNC_SETUP is returned.
6251 MachineBasicBlock &MBB = *MF.begin();
6252 for (MachineInstr &MI : MBB)
6253 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6254 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6255 return &MI;
6256
 // NOTE(review): the message below says SI_SETUP_WHOLE_WAVE_FUNC, but the
 // opcode tested above is SI_WHOLE_WAVE_FUNC_SETUP - confirm and align.
6257 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6258}
6259
6261 unsigned OpNo) const {
 // Returns the register class for operand OpNo of MI. The class comes from
 // the instruction description when it names one; otherwise it comes from
 // the register currently held by the operand.
6262 const MCInstrDesc &Desc = get(MI.getOpcode());
 // Fallback path: variadic instructions, operands beyond the described ones,
 // and operands whose description carries no register class (-1).
6263 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6264 Desc.operands()[OpNo].RegClass == -1) {
 // getReg() is called unconditionally here, so this path expects the operand
 // to be a register.
6265 Register Reg = MI.getOperand(OpNo).getReg();
6266
6267 if (Reg.isVirtual()) {
6268 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6269 return MRI.getRegClass(Reg);
6270 }
6271 return RI.getPhysRegBaseClass(Reg);
6272 }
6273
 // A negative class ID from getOpRegClassID yields nullptr.
6274 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6275 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6276}
6277
 // Moves the value of operand OpIdx into a freshly created virtual register
 // and rewrites the operand to use that register. The move is inserted at
 // iterator I.
6280 MachineBasicBlock *MBB = MI.getParent();
6281 MachineOperand &MO = MI.getOperand(OpIdx);
6282 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6283 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6284 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6285 unsigned Size = RI.getRegSizeInBits(*RC);
 // Default: a vector move sized after the operand's register class.
6286 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6287 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6288 : AMDGPU::V_MOV_B32_e32;
 // Register operands are moved with COPY. A non-register operand whose
 // described class is an SGPR class is moved with a scalar move instead.
6289 if (MO.isReg())
6290 Opcode = AMDGPU::COPY;
6291 else if (RI.isSGPRClass(RC))
6292 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6293
 // The destination always uses the VGPR class equivalent to RC.
6294 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6295 Register Reg = MRI.createVirtualRegister(VRC);
6296 DebugLoc DL = MBB->findDebugLoc(I);
6297 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6298 MO.ChangeToRegister(Reg, false);
6299}
6300
6303 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6304 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
 // Returns a register that holds sub-register SubIdx of SuperReg.
 // A non-virtual super register is resolved directly through RI.getSubReg
 // and no instruction is emitted.
6305 if (!SuperReg.getReg().isVirtual())
6306 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6307
 // Virtual super register: create a new SubRC register and COPY the
 // requested lane into it, immediately before MI.
6308 MachineBasicBlock *MBB = MI->getParent();
6309 const DebugLoc &DL = MI->getDebugLoc();
6310 Register SubReg = MRI.createVirtualRegister(SubRC);
6311
 // SuperReg may already carry a sub-register index; compose it with SubIdx
 // so that the COPY reads the correct part of the underlying register.
6312 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6313 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6314 .addReg(SuperReg.getReg(), {}, NewSubIdx);
6315 return SubReg;
6316}
6317
6320 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6321 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
 // Produces an operand for the SubIdx part of Op. Immediates are split
 // arithmetically; anything else goes through buildExtractSubReg.
6322 if (Op.isImm()) {
 // sub0 is the low 32 bits of the immediate and sub1 the high 32 bits; both
 // are narrowed to int32_t. No other index is supported for immediates.
6323 if (SubIdx == AMDGPU::sub0)
6324 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6325 if (SubIdx == AMDGPU::sub1)
6326 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6327
6328 llvm_unreachable("Unhandled register index for immediate");
6329 }
6330
 // The result is wrapped as a register use operand (isDef = false).
6331 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6332 SubIdx, SubRC);
6333 return MachineOperand::CreateReg(SubReg, false);
6334}
6335
6336// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6337void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6338 assert(Inst.getNumExplicitOperands() == 3);
6339 MachineOperand Op1 = Inst.getOperand(1);
6340 Inst.removeOperand(1);
6341 Inst.addOperand(Op1);
6342}
6343
6345 const MCOperandInfo &OpInfo,
6346 const MachineOperand &MO) const {
 // Checks whether register operand MO is compatible with the register class
 // that OpInfo requires. Non-register operands are never legal here.
6347 if (!MO.isReg())
6348 return false;
6349
6350 Register Reg = MO.getReg();
6351
 // DRC is the register class required by the operand description.
6352 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6353 if (Reg.isPhysical())
6354 return DRC->contains(Reg);
6355
6356 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6357
 // With a sub-register index, ask whether some super-register class of RC
 // has a SubReg part that lands in DRC.
6358 if (MO.getSubReg()) {
6359 const MachineFunction *MF = MO.getParent()->getMF();
6360 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6361 if (!SuperRC)
6362 return false;
6363 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6364 }
6365
 // Without a sub-register index, any common subclass of DRC and RC suffices.
6366 return RI.getCommonSubClass(DRC, RC) != nullptr;
6367}
6368
6370 const MachineOperand &MO) const {
 // Instruction-aware legality check for placing register operand MO at
 // position OpIdx of MI. It applies the register-class check of the
 // MRI/OpInfo overload and then several subtarget- and opcode-specific
 // restrictions.
6371 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6372 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6373 unsigned Opc = MI.getOpcode();
6374
6375 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6376 // information.
6377 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6378 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6379 constexpr AMDGPU::OpName OpNames[] = {
6380 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6381
 // Find which source slot (src0/src1/src2) OpIdx corresponds to, if any.
6382 for (auto [I, OpName] : enumerate(OpNames)) {
6383 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6384 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6386 return false;
6387 }
6388 }
6389
 // From here on MO is known to be a register, because the class check
 // rejects non-register operands.
6390 if (!isLegalRegOperand(MRI, OpInfo, MO))
6391 return false;
6392
6393 // check Accumulate GPR operand
6394 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6395 if (IsAGPR && !ST.hasMAIInsts())
6396 return false;
 // AGPRs are rejected on memory, DS and MIMG instructions unless the
 // subtarget has GFX90A instructions and the reserved registers are frozen.
6397 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6398 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6399 return false;
6400 // Atomics should have both vdst and vdata either vgpr or agpr.
6401 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6402 const int DataIdx = AMDGPU::getNamedOperandIdx(
6403 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
 // MO replaces vdst: the existing data operand must agree on AGPR-ness.
6404 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6405 MI.getOperand(DataIdx).isReg() &&
6406 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6407 return false;
 // MO replaces the data operand: vdst, and data1 if present, must agree.
6408 if ((int)OpIdx == DataIdx) {
6409 if (VDstIdx != -1 &&
6410 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6411 return false;
6412 // DS instructions with 2 src operands also must have tied RC.
6413 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6414 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6415 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6416 return false;
6417 }
6418
6419 // Check V_ACCVGPR_WRITE_B32_e64
 // Without GFX90A instructions, src0 of this opcode may not be an SGPR.
6420 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6421 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6422 RI.isSGPRReg(MRI, MO.getReg()))
6423 return false;
6424
 // On subtargets with the flat-scratch-hi hazard, SRC_FLAT_SCRATCH_BASE_HI
 // is rejected for SALU instructions that have a 64-bit sdst and for the
 // two 64-bit S_BITCMP opcodes.
6425 if (ST.hasFlatScratchHiInB64InstHazard() &&
6426 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6427 if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
6428 if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
6429 64)
6430 return false;
6431 }
6432 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6433 return false;
6434 }
 // DPP instructions may take an SGPR in src1 only if the subtarget says so.
6435 if (!ST.hasDPPSrc1SGPR() && isDPP(MI) && RI.isSGPRReg(MRI, MO.getReg()) &&
6436 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1))
6437 return false;
6438
6439 return true;
6440}
6441
6443 const MCOperandInfo &OpInfo,
6444 const MachineOperand &MO) const {
 // Register operands are checked against OpInfo's register class. Every
 // other operand kind accepted here is treated like an immediate and is
 // always legal.
6445 if (MO.isReg())
6446 return isLegalRegOperand(MRI, OpInfo, MO);
6447
6448 // Handle non-register types that are treated like immediates.
6449 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6450 return true;
6451}
6452
6454 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6455 const MachineOperand *MO) const {
 // Checks source number SrcN (0..2) of MI. The operand examined is MO when
 // it is non-null, otherwise the operand currently in that source slot. The
 // result is false only for an SGPR register source whose modifier operand
 // has OP_SEL_0 or OP_SEL_1 set.
6456 constexpr unsigned NumOps = 3;
 // Layout: entries [0, NumOps) are the source operands and entries
 // [NumOps, 2 * NumOps) are the matching modifier operands.
6457 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6458 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6459 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6460 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6461
6462 assert(SrcN < NumOps);
6463
6464 if (!MO) {
6465 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
 // The instruction has no such source, so there is nothing to restrict.
6466 if (SrcIdx == -1)
6467 return true;
6468 MO = &MI.getOperand(SrcIdx);
6469 }
6470
 // Only SGPR register sources are restricted.
6471 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6472 return true;
6473
6474 int ModsIdx =
6475 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6476 if (ModsIdx == -1)
6477 return true;
6478
 // The modifiers are always read from MI itself, even when MO was supplied
 // by the caller.
6479 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6480 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6481 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6482
6483 return !OpSel && !OpSelHi;
6484}
6485
6487 const MachineOperand *MO) const {
 // Decides whether MO would be a legal operand at position OpIdx of MI. A
 // null MO means "check the operand that is currently at OpIdx".
6488 const MachineFunction &MF = *MI.getMF();
6489 const MachineRegisterInfo &MRI = MF.getRegInfo();
6490 const MCInstrDesc &InstDesc = MI.getDesc();
6491 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6492 int64_t RegClass = getOpRegClassID(OpInfo);
 // DefinedRC stays null when the operand description names no register class.
6493 const TargetRegisterClass *DefinedRC =
6494 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6495 if (!MO)
6496 MO = &MI.getOperand(OpIdx);
6497
6498 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6499
 // VALU case: MO itself uses the constant bus, so check that the other
 // operands leave room within the constant-bus and literal limits.
6500 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6501 const MachineOperand *UsedLiteral = nullptr;
6502
6503 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
 // One literal is allowed unless this is VOP3 on a subtarget without VOP3
 // literal support.
6504 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6505
6506 // TODO: Be more permissive with frame indexes.
6507 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6508 if (!LiteralLimit--)
6509 return false;
6510
6511 UsedLiteral = MO;
6512 }
6513
6515 if (MO->isReg())
6516 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6517
 // Walk every other operand. Each distinct constant-bus register and each
 // new literal takes one slot. MO already occupies a slot, which is why
 // reaching zero counts as a failure.
6518 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6519 if (i == OpIdx)
6520 continue;
6521 const MachineOperand &Op = MI.getOperand(i);
6522 if (Op.isReg()) {
6523 if (Op.isUse()) {
6524 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6525 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6526 if (--ConstantBusLimit <= 0)
6527 return false;
6528 }
6529 }
6530 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6531 !isInlineConstant(Op, InstDesc.operands()[i])) {
6532 // The same literal may be used multiple times.
6533 if (!UsedLiteral)
6534 UsedLiteral = &Op;
6535 else if (UsedLiteral->isIdenticalTo(Op))
6536 continue;
6537
6538 if (!LiteralLimit--)
6539 return false;
6540 if (--ConstantBusLimit <= 0)
6541 return false;
6542 }
6543 }
6544 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6545 // There can be at most one literal operand, but it can be repeated.
6546 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6547 if (i == OpIdx)
6548 continue;
6549 const MachineOperand &Op = MI.getOperand(i);
6550 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6551 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6552 !Op.isIdenticalTo(*MO))
6553 return false;
6554
6555 // Do not fold a non-inlineable and non-register operand into an
6556 // instruction that already has a frame index. The frame index handling
6557 // code could not handle well when a frame index co-exists with another
6558 // non-register operand, unless that operand is an inlineable immediate.
6559 if (Op.isFI())
6560 return false;
6561 }
 // Inline constants are rejected for F16 pseudo-scalar transcendental
 // opcodes on subtargets that report they cannot take them.
6562 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6563 isF16PseudoScalarTrans(MI.getOpcode())) {
6564 return false;
6565 }
6566
6567 if (MO->isReg()) {
 // With no register class in the description, a register is accepted only
 // for operands of unknown type.
6568 if (!DefinedRC)
6569 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6570 return isLegalRegOperand(MI, OpIdx, *MO);
6571 }
6572
6573 if (MO->isImm()) {
6574 uint64_t Imm = MO->getImm();
6575 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6576 bool Is64BitOp = Is64BitFPOp ||
6577 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6578 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6579 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
 // A 64-bit operand that is not an inline constant must be a valid 32-bit
 // literal, unless the subtarget has 64-bit literals and the instruction
 // size is 4.
6580 if (Is64BitOp &&
6581 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6582 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6583 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6584 return false;
6585
6586 // FIXME: We can use sign extended 64-bit literals, but only for signed
6587 // operands. At the moment we do not know if an operand is signed.
6588 // Such operand will be encoded as its low 32 bits and then either
6589 // correctly sign extended or incorrectly zero extended by HW.
6590 // If 64-bit literals are supported and the literal will be encoded
6591 // as full 64 bit we still can use it.
6592 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6593 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6594 return false;
6595 }
6596 }
6597
6598 // Handle non-register types that are treated like immediates.
6599 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6600
6601 if (!DefinedRC) {
6602 // This operand expects an immediate.
6603 return true;
6604 }
6605
6606 return isImmOperandLegal(MI, OpIdx, *MO);
6607}
6608
 // Classifies MI by opcode family. The result is true only on subtargets
 // with GFX940 or GFX950 instructions, and only for VALU instructions that
 // are TRANS, DOT or MFMA, or that appear in the explicit opcode list below.
 // NOTE(review): despite the "Only" suffix, these two flags are plain
 // feature queries - confirm that the naming matches the intent.
6610 bool IsGFX950Only = ST.hasGFX950Insts();
6611 bool IsGFX940Only = ST.hasGFX940Insts();
6612
6613 if (!IsGFX950Only && !IsGFX940Only)
6614 return false;
6615
6616 if (!isVALU(MI))
6617 return false;
6618
6619 // V_COS, V_EXP, V_RCP, etc.
6620 if (isTRANS(MI))
6621 return true;
6622
6623 // DOT2, DOT2C, DOT4, etc.
6624 if (isDOT(MI))
6625 return true;
6626
6627 // MFMA, SMFMA
6628 if (isMFMA(MI))
6629 return true;
6630
 // Remaining individually listed opcodes: FP8/BF8 packed conversions, the
 // QSAD/MQSAD family and packed (V_PK_*) arithmetic.
6631 unsigned Opcode = MI.getOpcode();
6632 switch (Opcode) {
6633 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6634 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6635 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6636 case AMDGPU::V_MQSAD_U32_U8_e64:
6637 case AMDGPU::V_PK_ADD_F16:
6638 case AMDGPU::V_PK_ADD_F32:
6639 case AMDGPU::V_PK_ADD_I16:
6640 case AMDGPU::V_PK_ADD_U16:
6641 case AMDGPU::V_PK_ASHRREV_I16:
6642 case AMDGPU::V_PK_FMA_F16:
6643 case AMDGPU::V_PK_FMA_F32:
6644 case AMDGPU::V_PK_FMAC_F16_e32:
6645 case AMDGPU::V_PK_FMAC_F16_e64:
6646 case AMDGPU::V_PK_LSHLREV_B16:
6647 case AMDGPU::V_PK_LSHRREV_B16:
6648 case AMDGPU::V_PK_MAD_I16:
6649 case AMDGPU::V_PK_MAD_U16:
6650 case AMDGPU::V_PK_MAX_F16:
6651 case AMDGPU::V_PK_MAX_I16:
6652 case AMDGPU::V_PK_MAX_U16:
6653 case AMDGPU::V_PK_MIN_F16:
6654 case AMDGPU::V_PK_MIN_I16:
6655 case AMDGPU::V_PK_MIN_U16:
6656 case AMDGPU::V_PK_MOV_B32:
6657 case AMDGPU::V_PK_MUL_F16:
6658 case AMDGPU::V_PK_MUL_F32:
6659 case AMDGPU::V_PK_MUL_LO_U16:
6660 case AMDGPU::V_PK_SUB_I16:
6661 case AMDGPU::V_PK_SUB_U16:
6662 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6663 return true;
6664 default:
6665 return false;
6666 }
6667}
6668
6670 MachineInstr &MI) const {
// NOTE(review): the line that opens this definition (listing line 6669) is
// absent from this listing; `MRI`, used throughout, is presumably a parameter
// declared there -- confirm against the real file.
//
// Makes the src0/src1 operands of MI legal, mutating MI in place. In order:
//  1. an SGPR src0 is handed to legalizeOpWithMove when MI also reads an SGPR
//     implicitly and the constant bus limit for the opcode is <= 1;
//  2. V_WRITELANE_B32: VGPR src0/src1 are replaced by V_READFIRSTLANE_B32
//     results, then the function returns;
//  3. V_FMAC_F32_e32 / V_FMAC_F16_e32: a non-VGPR src2 goes through
//     legalizeOpWithMove;
//  4. if src1 is still not a legal register operand it is fixed by a
//     readfirstlane (V_READLANE_B32 with a VGPR lane select only), by swapping
//     src0/src1 under the commuted opcode, or otherwise by legalizeOpWithMove.
6671 unsigned Opc = MI.getOpcode();
6672 const MCInstrDesc &InstrDesc = get(Opc);
6673
6674 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6675 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6676
6677 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6678 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6679
6680 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6681 // we need to only have one constant bus use before GFX10.
6682 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6683 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6684 RI.isSGPRReg(MRI, Src0.getReg()))
6685 legalizeOpWithMove(MI, Src0Idx);
6686
6687 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6688 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6689 // src0/src1 with V_READFIRSTLANE.
6690 if (Opc == AMDGPU::V_WRITELANE_B32) {
6691 const DebugLoc &DL = MI.getDebugLoc();
6692 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6693 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6694 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6695 .add(Src0);
6696 Src0.ChangeToRegister(Reg, false);
6697 }
6698 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6699 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
// NOTE(review): this redeclares DL, shadowing the identical reference bound
// at the top of the enclosing `if`; it looks redundant.
6700 const DebugLoc &DL = MI.getDebugLoc();
6701 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6702 .add(Src1);
6703 Src1.ChangeToRegister(Reg, false);
6704 }
6705 return;
6706 }
6707
6708 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6709 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6710 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6711 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6712 legalizeOpWithMove(MI, Src2Idx);
6713 }
6714
6715 // VOP2 src0 instructions support all operand types, so we don't need to check
6716 // their legality. If src1 is already legal, we don't need to do anything.
6717 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6718 return;
6719
6720 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6721 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6722 // select is uniform.
6723 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6724 RI.isVGPR(MRI, Src1.getReg())) {
6725 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6726 const DebugLoc &DL = MI.getDebugLoc();
6727 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6728 .add(Src1);
6729 Src1.ChangeToRegister(Reg, false);
6730 return;
6731 }
6732
6733 // We do not use commuteInstruction here because it is too aggressive and will
6734 // commute if it is possible. We only want to commute here if it improves
6735 // legality. This can be called a fairly large number of times so don't waste
6736 // compile time pointlessly swapping and checking legality again.
6737 if (HasImplicitSGPR || !MI.isCommutable()) {
6738 legalizeOpWithMove(MI, Src1Idx);
6739 return;
6740 }
6741
6742 // If src0 can be used as src1, commuting will make the operands legal.
6743 // Otherwise we have to give up and insert a move.
6744 //
6745 // TODO: Other immediate-like operand kinds could be commuted if there was a
6746 // MachineOperand::ChangeTo* for them.
6747 if ((!Src1.isImm() && !Src1.isReg()) ||
6748 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6749 legalizeOpWithMove(MI, Src1Idx);
6750 return;
6751 }
6752
// No commuted form of this opcode: fall back to a move for src1.
6753 int CommutedOpc = commuteOpcode(MI);
6754 if (CommutedOpc == -1) {
6755 legalizeOpWithMove(MI, Src1Idx);
6756 return;
6757 }
6758
// Switch MI to the commuted opcode, then swap the two operands by hand.
6759 MI.setDesc(get(CommutedOpc));
6760
// Capture src0's register, subregister and kill flag before src0 is
// overwritten below; they are written into src1 afterwards.
6761 Register Src0Reg = Src0.getReg();
6762 unsigned Src0SubReg = Src0.getSubReg();
6763 bool Src0Kill = Src0.isKill();
6764
6765 if (Src1.isImm())
6766 Src0.ChangeToImmediate(Src1.getImm());
6767 else if (Src1.isReg()) {
6768 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6769 Src0.setSubReg(Src1.getSubReg());
6770 } else
6771 llvm_unreachable("Should only have register or immediate operands");
6772
6773 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6774 Src1.setSubReg(Src0SubReg);
// NOTE(review): listing line 6775 is absent here (numbering jumps from 6774
// to 6776); check the real file for a statement between the swap and the
// closing brace.
6776}
6777
6778 // Legalize VOP3 operands. All operand types are supported for any operand
6779 // but only one literal constant and only starting from GFX10.
6781 MachineInstr &MI) const {
// NOTE(review): the line that opens this definition (listing line 6780) is
// absent from this listing; `MRI` is presumably a parameter declared there.
//
// Walks src0..src2 of MI and hands to legalizeOpWithMove every operand that
// does not fit the remaining constant bus / literal budget. VGPR operands and
// inline constants are always left alone. The permlane opcodes listed below
// first get non-SGPR src1/src2 replaced by V_READFIRSTLANE_B32 results.
6782 unsigned Opc = MI.getOpcode();
6783
// Operand indices of src0, src1, src2; an entry is -1 when the opcode has no
// such operand.
6784 int VOP3Idx[3] = {
6785 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6786 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6787 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6788 };
6789
6790 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6791 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6792 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6793 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6794 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6795 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6796 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6797 // src1 and src2 must be scalar
6798 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6799 const DebugLoc &DL = MI.getDebugLoc();
6800 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6801 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6802 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6803 .add(Src1);
6804 Src1.ChangeToRegister(Reg, false);
6805 }
// src2 is only touched when the opcode actually has one.
6806 if (VOP3Idx[2] != -1) {
6807 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6808 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6809 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6810 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6811 .add(Src2);
6812 Src2.ChangeToRegister(Reg, false);
6813 }
6814 }
6815 }
6816
6817 // Find the one SGPR operand we are allowed to use.
6818 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6819 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6820 SmallDenseSet<unsigned> SGPRsUsed;
// The SGPR reported by findUsedSGPR is charged against the budget up front,
// so the loop below accepts it as already used.
6821 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6822 if (SGPRReg) {
6823 SGPRsUsed.insert(SGPRReg);
6824 --ConstantBusLimit;
6825 }
6826
6827 for (int Idx : VOP3Idx) {
// `break`, not `continue`: the first absent source operand ends the scan.
6828 if (Idx == -1)
6829 break;
6830 MachineOperand &MO = MI.getOperand(Idx);
6831
6832 if (!MO.isReg()) {
6833 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6834 continue;
6835
// A non-inline immediate may stay only while both a literal slot and a
// constant bus slot remain.
6836 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6837 --LiteralLimit;
6838 --ConstantBusLimit;
6839 continue;
6840 }
6841
// Both counters are decremented on this path too, so they can go negative;
// every later `> 0` test then fails.
6842 --LiteralLimit;
6843 --ConstantBusLimit;
6844 legalizeOpWithMove(MI, Idx);
6845 continue;
6846 }
6847
6848 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6849 continue; // VGPRs are legal
6850
6851 // We can use one SGPR in each VOP3 instruction prior to GFX10
6852 // and two starting from GFX10.
// An SGPR that is already recorded in SGPRsUsed is not charged again.
6853 if (SGPRsUsed.count(MO.getReg()))
6854 continue;
6855 if (ConstantBusLimit > 0) {
6856 SGPRsUsed.insert(MO.getReg());
6857 --ConstantBusLimit;
6858 continue;
6859 }
6860
6861 // If we make it this far, then the operand is not legal and we must
6862 // legalize it.
6863 legalizeOpWithMove(MI, Idx);
6864 }
6865
6866 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6867 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6868 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6869 legalizeOpWithMove(MI, VOP3Idx[2]);
6870
6871 // Fix the register class of packed FP32 instructions on gfx12+. See
6872 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
// NOTE(review): listing line 6873 is absent here; the extra closing brace
// after the loop below indicates that a guarding statement is opened on it.
6874 for (unsigned I = 0; I < 3; ++I) {
6875 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
6876 legalizeOpWithMove(MI, VOP3Idx[I]);
6877 }
6878 }
6879}
6880
6883 const TargetRegisterClass *DstRC /*=nullptr*/) const {
// NOTE(review): the lines that open this definition (listing lines 6881-6882)
// are absent from this listing; `SrcReg`, `UseMI` and `MRI` are presumably
// parameters declared there.
//
// Creates a new virtual register in the SGPR class equivalent to SrcReg's
// class (narrowed to the common subclass with DstRC when DstRC is non-null)
// and fills it from SrcReg with V_READFIRSTLANE_B32 instructions inserted
// before UseMI. Returns the new register.
6884 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6885 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6886 if (DstRC)
6887 SRC = RI.getCommonSubClass(SRC, DstRC);
6888
6889 Register DstReg = MRI.createVirtualRegister(SRC);
// Number of 32-bit pieces in the source register.
6890 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6891
// A source class with AGPRs is first copied into the equivalent VGPR class;
// the readfirstlanes below then read that copy.
6892 if (RI.hasAGPRs(VRC)) {
6893 VRC = RI.getEquivalentVGPRClass(VRC);
6894 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6895 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6896 get(TargetOpcode::COPY), NewSrcReg)
6897 .addReg(SrcReg);
6898 SrcReg = NewSrcReg;
6899 }
6900
// 32-bit source: a single readfirstlane writes DstReg directly.
6901 if (SubRegs == 1) {
6902 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6903 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6904 .addReg(SrcReg);
6905 return DstReg;
6906 }
6907
// Wider source: read each 32-bit channel into its own SGPR_32 register.
// NOTE(review): `SRegs` is declared on listing line 6908, which is absent here.
6909 for (unsigned i = 0; i < SubRegs; ++i) {
6910 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6911 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6912 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6913 .addReg(SrcReg, {}, RI.getSubRegFromChannel(i));
6914 SRegs.push_back(SGPR);
6915 }
6916
// Reassemble the pieces into DstReg with a REG_SEQUENCE.
// NOTE(review): `MIB` is declared on listing line 6917, which is absent here;
// the BuildMI call below is presumably its initializer.
6918 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6919 get(AMDGPU::REG_SEQUENCE), DstReg);
6920 for (unsigned i = 0; i < SubRegs; ++i) {
6921 MIB.addReg(SRegs[i]);
6922 MIB.addImm(RI.getSubRegFromChannel(i));
6923 }
6924 return DstReg;
6925}
6926
6928 MachineInstr &MI) const {
// NOTE(review): the line that opens this definition (listing line 6927) is
// absent from this listing; `MRI` is presumably a parameter declared there.
//
// Replaces the `sbase` and `soffset` operands of MI, when present and not
// SGPRs, with the register returned by readlaneVGPRToSGPR.
6929
6930 // If the pointer is stored in VGPRs, then we need to move them to
6931 // SGPRs using v_readfirstlane. This is safe because we only select
6932 // loads with uniform pointers to SMRD instruction so we know the
6933 // pointer value is uniform.
6934 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6935 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6936 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6937 SBase->setReg(SGPR);
6938 }
// Same treatment for the scalar offset operand.
6939 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6940 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6941 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6942 SOff->setReg(SGPR);
6943 }
6944}
6945
// NOTE(review): the line that opens this definition (listing line 6946) is
// absent from this listing; `Inst` is presumably its MachineInstr parameter.
//
// Tries to rewrite Inst, whose `saddr` operand holds a non-SGPR register, into
// a replacement opcode that takes that address as `vaddr` instead. Inst is
// modified in place. Returns false, leaving Inst untouched, when there is no
// saddr operand, no replacement opcode, saddr is already an SGPR, the new
// opcode has no vaddr, or an existing vaddr is not uniquely defined by a
// move-immediate of 0. Returns true after a successful rewrite.
6947 unsigned Opc = Inst.getOpcode();
6948 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6949 if (OldSAddrIdx < 0)
6950 return false;
6951
6952 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6953
6954 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
// NOTE(review): listing line 6956 is absent here, so the two identical tests
// below are not adjacent in the real file; presumably the missing line assigns
// a fallback opcode to NewOpc under the first test -- confirm.
6955 if (NewOpc < 0)
6957 if (NewOpc < 0)
6958 return false;
6959
6960 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6961 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6962 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6963 return false;
6964
6965 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6966 if (NewVAddrIdx < 0)
6967 return false;
6968
6969 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6970
6971 // Check vaddr, it shall be zero or absent.
6972 MachineInstr *VAddrDef = nullptr;
6973 if (OldVAddrIdx >= 0) {
6974 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6975 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6976 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6977 !VAddrDef->getOperand(1).isImm() ||
6978 VAddrDef->getOperand(1).getImm() != 0)
6979 return false;
6980 }
6981
// From here on the rewrite is committed: the descriptor changes first, then
// the operand list is reshaped to match it.
6982 const MCInstrDesc &NewDesc = get(NewOpc);
6983 Inst.setDesc(NewDesc);
6984
6985 // Callers expect iterator to be valid after this call, so modify the
6986 // instruction in place.
6987 if (OldVAddrIdx == NewVAddrIdx) {
6988 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6989 // Clear use list from the old vaddr holding a zero register.
6990 MRI.removeRegOperandFromUseList(&NewVAddr);
6991 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6992 Inst.removeOperand(OldSAddrIdx);
6993 // Update the use list with the pointer we have just moved from vaddr to
6994 // saddr position. Otherwise new vaddr will be missing from the use list.
6995 MRI.removeRegOperandFromUseList(&NewVAddr);
6996 MRI.addRegOperandToUseList(&NewVAddr);
6997 } else {
// Here the old saddr slot is the new vaddr slot, so the address operand stays
// where it is and only the old (zero) vaddr, if any, has to be removed.
6998 assert(OldSAddrIdx == NewVAddrIdx);
6999
7000 if (OldVAddrIdx >= 0) {
7001 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
7002 AMDGPU::OpName::vdst_in);
7003
7004 // removeOperand doesn't try to fixup tied operand indexes as it goes, so
7005 // it asserts. Untie the operands for now and retie them afterwards.
7006 if (NewVDstIn != -1) {
7007 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
7008 Inst.untieRegOperand(OldVDstIn);
7009 }
7010
7011 Inst.removeOperand(OldVAddrIdx);
7012
7013 if (NewVDstIn != -1) {
7014 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
7015 Inst.tieOperands(NewVDst, NewVDstIn);
7016 }
7017 }
7018 }
7019
// The instruction that defined the zero vaddr is erased once its result has
// no non-debug uses left.
7020 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
7021 VAddrDef->eraseFromParent();
7022
7023 return true;
7024}
7025
7026 // FIXME: Remove this when SelectionDAG is obsoleted.
7028 MachineInstr &MI) const {
// NOTE(review): the line that opens this definition (listing line 7027) is
// absent from this listing; `MRI` is presumably a parameter declared there.
//
// Replaces a non-SGPR `saddr` operand of MI with the register returned by
// readlaneVGPRToSGPR, constrained to the register class the instruction
// descriptor declares for that operand. Does nothing when MI is neither a
// segment-specific FLAT instruction nor on a subtarget with flat GVS mode,
// or when there is no saddr operand or it is already in an SGPR class.
7029 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
7030 return;
7031
7032 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
7033 // thinks they are uniform, so a readfirstlane should be valid.
7034 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
7035 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
7036 return;
7037
// NOTE(review): listing line 7038 is absent here; the `return` below is
// presumably the body of a condition on that line (another early-out before
// the readfirstlane fallback) -- confirm against the real file.
7039 return;
7040
7041 const TargetRegisterClass *DeclaredRC =
7042 getRegClass(MI.getDesc(), SAddr->getOperandNo());
7043
7044 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
7045 SAddr->setReg(ToSGPR);
7046}
7047
7050 const TargetRegisterClass *DstRC,
7053 const DebugLoc &DL) const {
// NOTE(review): listing lines 7048-7049 and 7051-7052 of this signature are
// absent from this listing; `InsertMBB`, `I`, `Op` and `MRI` are presumably
// parameters declared there.
//
// Ensures Op reads a register of class DstRC. When the class of Op's register
// (taking its subregister index into account) already equals DstRC nothing is
// done; otherwise a COPY into a fresh DstRC virtual register is inserted at I
// in InsertMBB and Op is redirected to that register.
7054 Register OpReg = Op.getReg();
7055 unsigned OpSubReg = Op.getSubReg();
7056
7057 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
7058 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
7059
7060 // Check if operand is already the correct register class.
7061 if (DstRC == OpRC)
7062 return;
7063
7064 Register DstReg = MRI.createVirtualRegister(DstRC);
7065 auto Copy =
7066 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
7067 Op.setReg(DstReg);
7068
// Without a defining instruction for the source there is nothing more to
// inspect.
7069 MachineInstr *Def = MRI.getVRegDef(OpReg);
7070 if (!Def)
7071 return;
7072
7073 // Try to eliminate the copy if it is copying an immediate value.
7074 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
7075 foldImmediate(*Copy, *Def, OpReg, &MRI);
7076
// Follow the chain of COPY definitions backwards (stopping at a physical
// source register or when no unique definition exists) to find out whether
// the value ultimately comes from an IMPLICIT_DEF.
7077 bool ImpDef = Def->isImplicitDef();
7078 while (!ImpDef && Def && Def->isCopy()) {
7079 if (Def->getOperand(1).getReg().isPhysical())
7080 break;
7081 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
7082 ImpDef = Def && Def->isImplicitDef();
7083 }
// A copy into a non-SGPR class gets an implicit EXEC use, unless it already
// reads EXEC or the copied value is an implicit def.
7084 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
7085 !ImpDef)
7086 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
7087}
7088
7089 // Emit the actual waterfall loop, executing the wrapped instruction for each
7090 // unique value of \p ScalarOps across all lanes. In the best case we execute 1
7091 // iteration, in the worst case we execute 64 (once per lane).
//
// For every operand in \p ScalarOps the loop block receives a
// V_READFIRSTLANE_B32 of its value, a V_CMP_EQ against the original VGPR
// value and, from the second comparison on, an AND of the comparison results.
// The operand is then rewritten to the SGPR result, or to PhySGPRs[Idx] via a
// COPY when a valid register is supplied there. Finally EXEC is narrowed with
// LMC.AndSaveExecOpc, and the XOR terminator plus the SI_WATERFALL_LOOP
// branch back to the loop block are appended to \p BodyBB.
//
// NOTE(review): listing lines 7092-7093, 7097, 7099 and 7102 are absent here;
// `TII`, `MRI`, `LoopBB`, `ST`, `LMC` and the insertion iterator `I` are
// presumably declared on them -- confirm against the real file.
7094 MachineBasicBlock &BodyBB, const DebugLoc &DL,
7095 ArrayRef<MachineOperand *> ScalarOps, ArrayRef<Register> PhySGPRs = {}) {
7096 MachineFunction &MF = *LoopBB.getParent();
7098 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7100 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7101
// CondReg accumulates the AND of all per-operand comparison results.
7103 Register CondReg;
7104 for (auto [Idx, ScalarOp] : enumerate(ScalarOps)) {
7105 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
7106 unsigned NumSubRegs = RegSize / 32;
7107 Register VScalarOp = ScalarOp->getReg();
7108
// Register class that V_READFIRSTLANE_B32 declares for its source operand.
7109 const TargetRegisterClass *RFLSrcRC =
7110 TII.getRegClass(TII.get(AMDGPU::V_READFIRSTLANE_B32), 1);
7111
7112 if (NumSubRegs == 1) {
// If the operand's class is not already contained in the readfirstlane source
// class, copy it into the common subclass first.
7113 const TargetRegisterClass *VScalarOpRC = MRI.getRegClass(VScalarOp);
7114 if (const TargetRegisterClass *Common =
7115 TRI->getCommonSubClass(VScalarOpRC, RFLSrcRC);
7116 Common != VScalarOpRC) {
7117 Register VRReg = MRI.createVirtualRegister(Common);
7118 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::COPY), VRReg).addReg(VScalarOp);
7119 VScalarOp = VRReg;
7120 }
7121 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7122
7123 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
7124 .addReg(VScalarOp);
7125
7126 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7127
// Compare the value just read with the per-lane value.
7128 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
7129 .addReg(CurReg)
7130 .addReg(VScalarOp);
7131
7132 // Combine the comparison results with AND.
7133 if (!CondReg) // First.
7134 CondReg = NewCondReg;
7135 else { // If not the first, we create an AND.
7136 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7137 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7138 .addReg(CondReg)
7139 .addReg(NewCondReg);
7140 CondReg = AndReg;
7141 }
7142
7143 // Update ScalarOp operand to use the SGPR ScalarOp.
7144 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7145 ScalarOp->setReg(CurReg);
7146 else {
7147 // Insert into the same block of use
7148 BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
7149 TII.get(AMDGPU::COPY), PhySGPRs[Idx])
7150 .addReg(CurReg);
7151 ScalarOp->setReg(PhySGPRs[Idx]);
7152 }
7153 ScalarOp->setIsKill();
7154 } else {
// Wider operands are processed two 32-bit channels at a time; each pair is
// read with two readfirstlanes and compared as one 64-bit value.
7155 SmallVector<Register, 8> ReadlanePieces;
7156 RegState VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
7157 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7158 "Unhandled register size");
7159
// Note: this loop's `Idx` is a channel index and shadows the operand index
// `Idx` of the enclosing enumerate loop. The PhySGPRs[Idx] uses after this
// loop are outside its scope and refer to the operand index again.
7160 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7161 Register CurRegLo =
7162 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7163 Register CurRegHi =
7164 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7165
7166 // Read the next variant <- also loop target.
7167 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
7168 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
7169
7170 // Read the next variant <- also loop target.
7171 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
7172 .addReg(VScalarOp, VScalarOpUndef,
7173 TRI->getSubRegFromChannel(Idx + 1));
7174
7175 ReadlanePieces.push_back(CurRegLo);
7176 ReadlanePieces.push_back(CurRegHi);
7177
7178 // Comparison is to be done as 64-bit.
7179 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
7180 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
7181 .addReg(CurRegLo)
7182 .addImm(AMDGPU::sub0)
7183 .addReg(CurRegHi)
7184 .addImm(AMDGPU::sub1);
7185
7186 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7187 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
7188 NewCondReg)
7189 .addReg(CurReg);
// A 64-bit operand is compared whole; for anything wider, the matching
// two-channel subregister is compared.
7190 if (NumSubRegs <= 2)
7191 Cmp.addReg(VScalarOp);
7192 else
7193 Cmp.addReg(VScalarOp, VScalarOpUndef,
7194 TRI->getSubRegFromChannel(Idx, 2));
7195
7196 // Combine the comparison results with AND.
7197 if (!CondReg) // First.
7198 CondReg = NewCondReg;
7199 else { // If not the first, we create an AND.
7200 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7201 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7202 .addReg(CondReg)
7203 .addReg(NewCondReg);
7204 CondReg = AndReg;
7205 }
7206 } // End for loop.
7207
7208 const auto *SScalarOpRC =
7209 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
7210 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
7211
7212 // Build scalar ScalarOp.
7213 auto Merge =
7214 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7215 unsigned Channel = 0;
7216 for (Register Piece : ReadlanePieces) {
7217 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7218 }
7219
7220 // Update ScalarOp operand to use the SGPR ScalarOp.
7221 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7222 ScalarOp->setReg(SScalarOp);
7223 else {
7224 BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
7225 TII.get(AMDGPU::COPY), PhySGPRs[Idx])
7226 .addReg(SScalarOp);
7227 ScalarOp->setReg(PhySGPRs[Idx]);
7228 }
7229 ScalarOp->setIsKill();
7230 }
7231 }
7232
7233 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7234 MRI.setSimpleHint(SaveExec, CondReg);
7235
7236 // Update EXEC to matching lanes, saving original to SaveExec.
7237 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7238 .addReg(CondReg, RegState::Kill);
7239
7240 // The original instruction is here; we insert the terminators after it.
7241 I = BodyBB.end();
7242
7243 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7244 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7245 .addReg(LMC.ExecReg)
7246 .addReg(SaveExec);
7247
7248 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7249}
7250
7251 // Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7252 // with SGPRs by iterating over all unique values across all lanes.
7253 // Returns the loop basic block that now contains \p MI.
//
// The block containing \p MI is split in place: the instructions from Begin
// up to End are moved into a new body block, everything from End onward into
// a new remainder block, and a new loop block is inserted in front of the
// body. EXEC is saved before Begin and restored at the start of the remainder
// block; SCC is saved and restored the same way unless the liveness query
// reports it dead. When \p MDT is non-null the dominator tree is updated for
// the three new blocks.
//
// NOTE(review): listing lines 7255-7257, 7265, 7275, 7286, 7301, 7310-7311
// and 7313 are absent here; the function name, the leading parameters (`TII`,
// `MI`, `ScalarOps`, `MDT`), and the locals `ST`, `LMC`, `AfterMI`, `LoopBB`,
// `BodyBB` and `MBBI` are presumably declared on them -- confirm against the
// real file.
7254static MachineBasicBlock *
7258 MachineBasicBlock::iterator Begin = nullptr,
7259 MachineBasicBlock::iterator End = nullptr,
7260 ArrayRef<Register> PhySGPRs = {}) {
7261 assert((PhySGPRs.empty() || PhySGPRs.size() == ScalarOps.size()) &&
7262 "Physical SGPRs must be empty or match the number of scalar operands");
7263 MachineBasicBlock &MBB = *MI.getParent();
7264 MachineFunction &MF = *MBB.getParent();
7266 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7267 MachineRegisterInfo &MRI = MF.getRegInfo();
// With the default (invalid) iterators the wrapped range is MI alone:
// Begin = MI and End = the instruction after MI.
7268 if (!Begin.isValid())
7269 Begin = &MI;
7270 if (!End.isValid()) {
7271 End = &MI;
7272 ++End;
7273 }
7274 const DebugLoc &DL = MI.getDebugLoc();
7276 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7277
7278 // Save SCC. Waterfall Loop may overwrite SCC.
7279 Register SaveSCCReg;
7280
7281 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7282 // rather than unlimited scan everywhere
// NOTE(review): the right-hand side of this comparison is on listing line
// 7286, which is absent here.
7283 bool SCCNotDead =
7284 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7285 std::numeric_limits<unsigned>::max()) !=
7287 if (SCCNotDead) {
// Materialize the current SCC value as 1/0 in an SGPR; it is turned back
// into SCC by the S_CMP_LG_U32 emitted in the remainder block below.
7288 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7289 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7290 .addImm(1)
7291 .addImm(0);
7292 }
7293
7294 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7295
7296 // Save the EXEC mask
7297 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7298
7299 // Killed uses in the instruction we are waterfalling around will be
7300 // incorrect due to the added control-flow.
7302 ++AfterMI;
7303 for (auto I = Begin; I != AfterMI; I++) {
7304 for (auto &MO : I->all_uses())
7305 MRI.clearKillFlags(MO.getReg());
7306 }
7307
7308 // To insert the loop we need to split the block. Move everything after this
7309 // point to a new block, and insert a new empty block between the two.
7312 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7314 ++MBBI;
7315
// Layout order after MBB: LoopBB, BodyBB, RemainderBB.
7316 MF.insert(MBBI, LoopBB);
7317 MF.insert(MBBI, BodyBB);
7318 MF.insert(MBBI, RemainderBB);
7319
// CFG edges: LoopBB -> BodyBB, and BodyBB -> LoopBB (back edge) or
// RemainderBB (exit).
7320 LoopBB->addSuccessor(BodyBB);
7321 BodyBB->addSuccessor(LoopBB);
7322 BodyBB->addSuccessor(RemainderBB);
7323
7324 // Move Begin to MI to the BodyBB, and the remainder of the block to
7325 // RemainderBB.
7326 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7327 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7328 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7329
7330 MBB.addSuccessor(LoopBB);
7331
7332 // Update dominators. We know that MBB immediately dominates LoopBB, that
7333 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7334 // RemainderBB. RemainderBB immediately dominates all of the successors
7335 // transferred to it from MBB that MBB used to properly dominate.
7336 if (MDT) {
7337 MDT->addNewBlock(LoopBB, &MBB);
7338 MDT->addNewBlock(BodyBB, LoopBB);
7339 MDT->addNewBlock(RemainderBB, BodyBB);
7340 for (auto &Succ : RemainderBB->successors()) {
7341 if (MDT->properlyDominates(&MBB, Succ)) {
7342 MDT->changeImmediateDominator(Succ, RemainderBB);
7343 }
7344 }
7345 }
7346
// Fill in the loop header and the terminators of the body block.
7347 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps,
7348 PhySGPRs);
7349
// Both restores are inserted before the same iterator, so the SCC restore
// ends up ahead of the EXEC restore in RemainderBB.
7350 MachineBasicBlock::iterator First = RemainderBB->begin();
7351 // Restore SCC
7352 if (SCCNotDead) {
7353 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7354 .addReg(SaveSCCReg, RegState::Kill)
7355 .addImm(0);
7356 }
7357
7358 // Restore the EXEC mask
7359 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7360 .addReg(SaveExec);
7361 return BodyBB;
7362}
7363
7364 // Extract pointer from Rsrc and return a zero-value Rsrc replacement.
//
// The pointer is taken as the sub0_sub1 (64-bit) part of the 128-bit VGPR
// resource descriptor \p Rsrc. The replacement descriptor is a new SGPR_128
// register, built before \p MI, whose sub0_sub1 is zero and whose sub2/sub3
// hold the low/high halves of TII.getDefaultRsrcDataFormat().
// Returns (extracted pointer register, replacement descriptor register).
//
// NOTE(review): the line carrying the function name and parameters (listing
// line 7366) is absent from this listing; `TII`, `MI` and `Rsrc` are
// presumably declared there.
7365static std::tuple<unsigned, unsigned>
7367 MachineBasicBlock &MBB = *MI.getParent();
7368 MachineFunction &MF = *MBB.getParent();
7369 MachineRegisterInfo &MRI = MF.getRegInfo();
7370
7371 // Extract the ptr from the resource descriptor.
7372 unsigned RsrcPtr =
7373 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7374 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7375
7376 // Create an empty resource descriptor
7377 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7378 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7379 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7380 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7381 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7382
7383 // Zero64 = 0
7384 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7385 .addImm(0);
7386
7387 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7388 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7389 .addImm(Lo_32(RsrcDataFormat));
7390
7391 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7392 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7393 .addImm(Hi_32(RsrcDataFormat));
7394
7395 // NewSRsrc = {Zero64, SRsrcFormat}
7396 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7397 .addReg(Zero64)
7398 .addImm(AMDGPU::sub0_sub1)
7399 .addReg(SRsrcFormatLo)
7400 .addImm(AMDGPU::sub2)
7401 .addReg(SRsrcFormatHi)
7402 .addImm(AMDGPU::sub3);
7403
7404 return std::tuple(RsrcPtr, NewSRsrc);
7405}
7406
7409 MachineDominatorTree *MDT) const {
7410 MachineFunction &MF = *MI.getMF();
7411 MachineRegisterInfo &MRI = MF.getRegInfo();
7412 MachineBasicBlock *CreatedBB = nullptr;
7413
7414 // Legalize VOP2
7415 if (isVOP2(MI) || isVOPC(MI)) {
7417 return CreatedBB;
7418 }
7419
7420 // Legalize VOP3
7421 if (isVOP3(MI)) {
7423 return CreatedBB;
7424 }
7425
7426 // Legalize SMRD
7427 if (isSMRD(MI)) {
7429 return CreatedBB;
7430 }
7431
7432 // Legalize FLAT
7433 if (isFLAT(MI)) {
7435 return CreatedBB;
7436 }
7437
7438 // Legalize PHI
7439 // The register class of the operands must be the same type as the register
7440 // class of the output.
7441 if (MI.getOpcode() == AMDGPU::PHI) {
7442 const TargetRegisterClass *VRC = getOpRegClass(MI, 0);
7443 assert(!RI.isSGPRClass(VRC));
7444
7445 // Update all the operands so they have the same type.
7446 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7447 MachineOperand &Op = MI.getOperand(I);
7448 if (!Op.isReg() || !Op.getReg().isVirtual())
7449 continue;
7450
7451 // MI is a PHI instruction.
7452 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7454
7455 // Avoid creating no-op copies with the same src and dst reg class. These
7456 // confuse some of the machine passes.
7457 legalizeGenericOperand(*InsertBB, Insert, VRC, Op, MRI, MI.getDebugLoc());
7458 }
7459 }
7460
7461 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7462 // VGPR dest type and SGPR sources, insert copies so all operands are
7463 // VGPRs. This seems to help operand folding / the register coalescer.
7464 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7465 MachineBasicBlock *MBB = MI.getParent();
7466 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7467 if (RI.hasVGPRs(DstRC)) {
7468 // Update all the operands so they are VGPR register classes. These may
7469 // not be the same register class because REG_SEQUENCE supports mixing
7470 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7471 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7472 MachineOperand &Op = MI.getOperand(I);
7473 if (!Op.isReg() || !Op.getReg().isVirtual())
7474 continue;
7475
7476 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7477 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7478 if (VRC == OpRC)
7479 continue;
7480
7481 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7482 Op.setIsKill();
7483 }
7484 }
7485
7486 return CreatedBB;
7487 }
7488
7489 // Legalize INSERT_SUBREG
7490 // src0 must have the same register class as dst
7491 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7492 Register Dst = MI.getOperand(0).getReg();
7493 Register Src0 = MI.getOperand(1).getReg();
7494 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7495 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7496 if (DstRC != Src0RC) {
7497 MachineBasicBlock *MBB = MI.getParent();
7498 MachineOperand &Op = MI.getOperand(1);
7499 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7500 }
7501 return CreatedBB;
7502 }
7503
7504 // Legalize SI_INIT_M0
7505 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7506 MachineOperand &Src = MI.getOperand(0);
7507 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7508 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7509 return CreatedBB;
7510 }
7511
7512 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7513 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7514 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7515 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7516 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7517 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7518 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7519 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7520 MachineOperand &Src = MI.getOperand(1);
7521 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7522 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7523 return CreatedBB;
7524 }
7525
7526 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7527 //
7528 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7529 // scratch memory access. In both cases, the legalization never involves
7530 // conversion to the addr64 form.
7532 (isMUBUF(MI) || isMTBUF(MI)))) {
7533 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7534 ? AMDGPU::OpName::rsrc
7535 : AMDGPU::OpName::srsrc;
7536 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7537 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7538 CreatedBB = generateWaterFallLoop(*this, MI, {SRsrc}, MDT);
7539
7540 AMDGPU::OpName SampOpName =
7541 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7542 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7543 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7544 CreatedBB = generateWaterFallLoop(*this, MI, {SSamp}, MDT);
7545
7546 return CreatedBB;
7547 }
7548
7549 // Legalize SI_CALL
7550 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7551 MachineOperand *Dest = &MI.getOperand(0);
7552 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7553 createWaterFallForSiCall(&MI, MDT, {Dest});
7554 }
7555 }
7556
7557 // Legalize s_sleep_var.
7558 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7559 const DebugLoc &DL = MI.getDebugLoc();
7560 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7561 int Src0Idx =
7562 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7563 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7564 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7565 .add(Src0);
7566 Src0.ChangeToRegister(Reg, false);
7567 return nullptr;
7568 }
7569
7570 // Legalize TENSOR_LOAD_TO_LDS_d2/_d4, TENSOR_STORE_FROM_LDS_d2/_d4. All their
7571 // operands are scalar.
7572 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d2 ||
7573 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d4 ||
7574 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d2 ||
7575 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d4) {
7576 for (MachineOperand &Src : MI.explicit_operands()) {
7577 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7578 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7579 }
7580 return CreatedBB;
7581 }
7582
7583 // Legalize MUBUF instructions.
7584 bool isSoffsetLegal = true;
7585 int SoffsetIdx =
7586 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7587 if (SoffsetIdx != -1) {
7588 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7589 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7590 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7591 isSoffsetLegal = false;
7592 }
7593 }
7594
7595 bool isRsrcLegal = true;
7596 int RsrcIdx =
7597 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7598 if (RsrcIdx != -1) {
7599 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7600 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7601 isRsrcLegal = false;
7602 }
7603
7604 // The operands are legal.
7605 if (isRsrcLegal && isSoffsetLegal)
7606 return CreatedBB;
7607
7608 if (!isRsrcLegal) {
7609 // Legalize a VGPR Rsrc
7610 //
7611 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7612 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7613 // a zero-value SRsrc.
7614 //
7615 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7616 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7617 // above.
7618 //
7619 // Otherwise we are on non-ADDR64 hardware, and/or we have
7620 // idxen/offen/bothen and we fall back to a waterfall loop.
7621
7622 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7623 MachineBasicBlock &MBB = *MI.getParent();
7624
7625 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7626 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7627 // This is already an ADDR64 instruction so we need to add the pointer
7628 // extracted from the resource descriptor to the current value of VAddr.
7629 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7630 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7631 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7632
7633 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7634 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7635 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7636
7637 unsigned RsrcPtr, NewSRsrc;
7638 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7639
7640 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7641 const DebugLoc &DL = MI.getDebugLoc();
7642 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7643 .addDef(CondReg0)
7644 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7645 .addReg(VAddr->getReg(), {}, AMDGPU::sub0)
7646 .addImm(0);
7647
7648 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7649 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7650 .addDef(CondReg1, RegState::Dead)
7651 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7652 .addReg(VAddr->getReg(), {}, AMDGPU::sub1)
7653 .addReg(CondReg0, RegState::Kill)
7654 .addImm(0);
7655
7656 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7657 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7658 .addReg(NewVAddrLo)
7659 .addImm(AMDGPU::sub0)
7660 .addReg(NewVAddrHi)
7661 .addImm(AMDGPU::sub1);
7662
7663 VAddr->setReg(NewVAddr);
7664 Rsrc->setReg(NewSRsrc);
7665 } else if (!VAddr && ST.hasAddr64()) {
7666 // This instruction is the _OFFSET variant, so we need to convert it to
7667 // ADDR64.
7668 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7669 "FIXME: Need to emit flat atomics here");
7670
7671 unsigned RsrcPtr, NewSRsrc;
7672 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7673
7674 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7675 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7676 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7677 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7678 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7679
7680 // Atomics with return have an additional tied operand and are
7681 // missing some of the special bits.
7682 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7683 MachineInstr *Addr64;
7684
7685 if (!VDataIn) {
7686 // Regular buffer load / store.
7688 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7689 .add(*VData)
7690 .addReg(NewVAddr)
7691 .addReg(NewSRsrc)
7692 .add(*SOffset)
7693 .add(*Offset);
7694
7695 if (const MachineOperand *CPol =
7696 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7697 MIB.addImm(CPol->getImm());
7698 }
7699
7700 if (const MachineOperand *TFE =
7701 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7702 MIB.addImm(TFE->getImm());
7703 }
7704
7705 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7706
7707 MIB.cloneMemRefs(MI);
7708 Addr64 = MIB;
7709 } else {
7710 // Atomics with return.
7711 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7712 .add(*VData)
7713 .add(*VDataIn)
7714 .addReg(NewVAddr)
7715 .addReg(NewSRsrc)
7716 .add(*SOffset)
7717 .add(*Offset)
7718 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7719 .cloneMemRefs(MI);
7720 }
7721
7722 MI.removeFromParent();
7723
7724 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7725 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7726 NewVAddr)
7727 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7728 .addImm(AMDGPU::sub0)
7729 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7730 .addImm(AMDGPU::sub1);
7731 } else {
7732 // Legalize a VGPR Rsrc and soffset together.
7733 if (!isSoffsetLegal) {
7734 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7735 CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc, Soffset}, MDT);
7736 return CreatedBB;
7737 }
7738 CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc}, MDT);
7739 return CreatedBB;
7740 }
7741 }
7742
7743 // Legalize a VGPR soffset.
7744 if (!isSoffsetLegal) {
7745 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7746 CreatedBB = generateWaterFallLoop(*this, MI, {Soffset}, MDT);
7747 return CreatedBB;
7748 }
7749 return CreatedBB;
7750}
7751
// Adds MI to InstrList and, when its opcode has an srsrc operand, to
// DeferredList as well.
7753 InstrList.insert(MI);
7754 // Instructions with an srsrc operand (e.g. MUBUF) also go on the deferred
// list. getNamedOperandIdx yields -1 when the opcode has no such operand.
7755 int RsrcIdx =
7756 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7757 if (RsrcIdx != -1) {
7758 DeferredList.insert(MI);
7759 }
7760}
7761
// Reports whether MI is present in DeferredList.
7763 return DeferredList.contains(MI);
7764}
7765
7766// Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7767// lowering (changing an SGPR to a VGPR).
7768// These are mainly caused by 16-bit SALU and 16-bit VALU instructions using
7769// registers of different sizes, so the operand sizes have to be legalized along
7770// the VGPR lowering chain. This can be removed once sgpr16 is in place.
//
// Only operand OpIdx of MI is examined, and nothing is done unless the
// subtarget uses real true16 instructions.
7772 MachineRegisterInfo &MRI) const {
7773 if (!ST.useRealTrue16Insts())
7774 return;
7775
7776 unsigned Opcode = MI.getOpcode();
7777 MachineBasicBlock *MBB = MI.getParent();
7778 // Nothing to do for operand 0, for an index beyond the explicit operands or
// beyond the operands of the instruction description, or for an operand
// whose description carries no register class (-1).
7779 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7780 OpIdx >= get(Opcode).getNumOperands() ||
7781 get(Opcode).operands()[OpIdx].RegClass == -1)
7782 return;
7783
// Only virtual register operands are rewritten.
7784 MachineOperand &Op = MI.getOperand(OpIdx);
7785 if (!Op.isReg() || !Op.getReg().isVirtual())
7786 return;
7787
// ... and only when the register currently has a VGPR class.
7788 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7789 if (!RI.isVGPRClass(CurrRC))
7790 return;
7791
// Compare the class the instruction description expects with the current one.
7792 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7793 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
// The lo16 sub-register of the current register fits the expected class:
// simply read that sub-register.
7794 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7795 Op.setSubReg(AMDGPU::lo16);
// The current register fits as the lo16 half of the expected class: build a
// new VGPR_32 with REG_SEQUENCE, using the operand as lo16 and an
// IMPLICIT_DEF VGPR_16 as hi16, and make the operand use it.
7796 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7797 const DebugLoc &DL = MI.getDebugLoc();
7798 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7799 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7800 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7801 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7802 .addReg(Op.getReg())
7803 .addImm(AMDGPU::lo16)
7804 .addReg(Undef)
7805 .addImm(AMDGPU::hi16);
7806 Op.setReg(NewDstReg);
7807 }
7808}
7810 MachineRegisterInfo &MRI) const {
// Visit every explicit operand of MI after operand 0.
7811 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7813}
7814
7818 ArrayRef<Register> PhySGPRs) const {
7819 assert(MI->getOpcode() == AMDGPU::SI_CALL_ISEL &&
7820 "This only handle waterfall for SI_CALL_ISEL");
7821 // The whole call sequence has to be moved into the loop block: everything
7822 // from ADJCALLSTACKUP through ADJCALLSTACKDOWN plus the copies that follow
7823 // it. This includes the copies from and to physical registers around the
7824 // call.
7825 MachineBasicBlock &MBB = *MI->getParent();
// Step Start backwards until it reaches the ADJCALLSTACKUP.
7827 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
7828 --Start;
// Step End forwards until it reaches the ADJCALLSTACKDOWN.
7830 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
7831 ++End;
7832
7833 // Move past ADJCALLSTACKDOWN, then extend the range over the following
// COPYs whose source register is defined by the call (the return value).
7834 ++End;
7835 while (End != MBB.end() && End->isCopy() &&
7836 MI->definesRegister(End->getOperand(1).getReg(), &RI))
7837 ++End;
7838
// Wrap [Start, End) in a waterfall loop over ScalarOps.
7839 generateWaterFallLoop(*this, *MI, ScalarOps, MDT, Start, End, PhySGPRs);
7840}
7841
7843 MachineDominatorTree *MDT) const {
// Copies to physical registers seen while lowering, each mapped to whether it
// is to be erased once everything has been processed.
7845 DenseMap<MachineInstr *, bool> V2SPhyCopiesToErase;
// First pass: drain the worklist, leaving deferred instructions for later.
7846 while (!Worklist.empty()) {
7847 MachineInstr &Inst = *Worklist.top();
7848 Worklist.erase_top();
7849 // Skip MachineInstr in the deferred list.
7850 if (Worklist.isDeferred(&Inst))
7851 continue;
7852 moveToVALUImpl(Worklist, MDT, Inst, WaterFalls, V2SPhyCopiesToErase);
7853 }
7854
7855 // Deferred list of instructions will be processed once
7856 // all the MachineInstr in the worklist are done.
7857 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7858 moveToVALUImpl(Worklist, MDT, *Inst, WaterFalls, V2SPhyCopiesToErase);
7859 assert(Worklist.empty() &&
7860 "Deferred MachineInstr are not supposed to re-populate worklist");
7861 }
7862
// Emit the waterfall loops recorded for SI_CALL_ISEL instructions, passing the
// collected operands and physical SGPRs.
7863 for (std::pair<MachineInstr *, V2PhysSCopyInfo> &Entry : WaterFalls) {
7864 if (Entry.first->getOpcode() == AMDGPU::SI_CALL_ISEL)
7865 createWaterFallForSiCall(Entry.first, MDT, Entry.second.MOs,
7866 Entry.second.SGPRs);
7867 }
7868
// Finally erase the recorded copies whose flag is still true.
7869 for (std::pair<MachineInstr *, bool> Entry : V2SPhyCopiesToErase)
7870 if (Entry.second)
7871 Entry.first->eraseFromParent();
7872}
7874 MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const {
7875 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7876 // hope for the best.
// All new instructions are inserted before Inst; Inst itself is neither
// modified nor erased here.
7877 const TargetRegisterClass *DstRC = RI.getRegClassForReg(MRI, DstReg);
// Split the destination class into parts of size 4 (presumably bytes, i.e.
// one V_READFIRSTLANE_B32 per part; confirm against getRegSplitParts).
7878 ArrayRef<int16_t> SubRegIndices = RI.getRegSplitParts(DstRC, 4);
7879 if (SubRegIndices.size() <= 1) {
// Single part: read Inst's source operand into a fresh SReg_32_XM0 and copy
// that into DstReg.
7880 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7881 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7882 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7883 .add(Inst.getOperand(1));
7884 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
7885 DstReg)
7886 .addReg(NewDst);
7887 } else {
// Several parts: read each sub-register of the source separately ...
7889 for (int16_t Indice : SubRegIndices) {
7890 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7891 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7892 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7893 .addReg(Inst.getOperand(1).getReg(), {}, Indice);
7894
7895 DstRegs.push_back(NewDst);
7896 }
// ... and assemble the pieces into DstReg with a REG_SEQUENCE, using the
// sub-register index of channel i for the i-th piece.
7898 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7899 get(AMDGPU::REG_SEQUENCE), DstReg);
7900 for (unsigned i = 0; i < SubRegIndices.size(); ++i) {
7901 MIB.addReg(DstRegs[i]);
7902 MIB.addImm(RI.getSubRegFromChannel(i));
7903 }
7904 }
7905}
7906
7908 SIInstrWorklist &Worklist, Register DstReg, MachineInstr &Inst,
7911 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
// A copy into M0 is always replaced by a V_READFIRSTLANE sequence, and the
// copy is marked for erasing.
7912 if (DstReg == AMDGPU::M0) {
7913 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
7914 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7915 return;
7916 }
7917 Register SrcReg = Inst.getOperand(1).getReg();
7920 // Only search current block since phyreg's def & use cannot cross
7921 // blocks when MF.NoPhi = false.
7922 while (++I != E) {
7923 // For SI_CALL_ISEL users, replace the phys SGPR with the VGPR source
7924 // and record the operand for later waterfall loop generation.
7925 if (I->getOpcode() == AMDGPU::SI_CALL_ISEL) {
7926 MachineInstr *UseMI = &*I;
7927 for (unsigned i = 0; i < UseMI->getNumOperands(); ++i) {
7928 if (UseMI->getOperand(i).isReg() &&
7929 UseMI->getOperand(i).getReg() == DstReg) {
7930 MachineOperand *MO = &UseMI->getOperand(i);
7931 MO->setReg(SrcReg);
7932 V2PhysSCopyInfo &V2SCopyInfo = WaterFalls[UseMI];
7933 V2SCopyInfo.MOs.push_back(MO);
7934 V2SCopyInfo.SGPRs.push_back(DstReg);
// try_emplace leaves an existing entry alone, so a copy already marked
// non-erasable (false) stays that way.
7935 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7936 }
7937 }
// SI_RETURN_TO_EPILOG reading DstReg as operand 0: materialize the value with
// V_READFIRSTLANE before the copy instead.
7938 } else if (I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG &&
7939 I->getOperand(0).isReg() &&
7940 I->getOperand(0).getReg() == DstReg) {
7941 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
7942 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7943 } else if (I->readsRegister(DstReg, &RI)) {
7944 // COPY cannot be erased if other type of inst uses it.
// The assignment overrides any earlier "erase" mark for this copy.
7945 V2SPhyCopiesToErase[&Inst] = false;
7946 }
// Stop scanning once DstReg is redefined; later uses see the new value.
7947 if (I->findRegisterDefOperand(DstReg, &RI))
7948 break;
7949 }
7950}
7951
7953 SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst,
7955 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
7956
7958 if (!MBB)
7959 return;
7960 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7961 unsigned Opcode = Inst.getOpcode();
7962 unsigned NewOpcode = getVALUOp(Inst);
7963 const DebugLoc &DL = Inst.getDebugLoc();
7964
7965 // Handle some special cases
7966 switch (Opcode) {
7967 default:
7968 break;
7969 case AMDGPU::S_ADD_I32:
7970 case AMDGPU::S_SUB_I32: {
7971 // FIXME: The u32 versions currently selected use the carry.
7972 bool Changed;
7973 MachineBasicBlock *CreatedBBTmp = nullptr;
7974 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7975 if (Changed)
7976 return;
7977
7978 // Default handling
7979 break;
7980 }
7981
7982 case AMDGPU::S_MUL_U64:
7983 if (ST.hasVMulU64Inst()) {
7984 NewOpcode = AMDGPU::V_MUL_U64_e64;
7985 break;
7986 }
7987 // Split s_mul_u64 in 32-bit vector multiplications.
7988 splitScalarSMulU64(Worklist, Inst, MDT);
7989 Inst.eraseFromParent();
7990 return;
7991
7992 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7993 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7994 // This is a special case of s_mul_u64 where all the operands are either
7995 // zero extended or sign extended.
7996 splitScalarSMulPseudo(Worklist, Inst, MDT);
7997 Inst.eraseFromParent();
7998 return;
7999
8000 case AMDGPU::S_AND_B64:
8001 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
8002 Inst.eraseFromParent();
8003 return;
8004
8005 case AMDGPU::S_OR_B64:
8006 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
8007 Inst.eraseFromParent();
8008 return;
8009
8010 case AMDGPU::S_XOR_B64:
8011 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
8012 Inst.eraseFromParent();
8013 return;
8014
8015 case AMDGPU::S_NAND_B64:
8016 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
8017 Inst.eraseFromParent();
8018 return;
8019
8020 case AMDGPU::S_NOR_B64:
8021 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
8022 Inst.eraseFromParent();
8023 return;
8024
8025 case AMDGPU::S_XNOR_B64:
8026 if (ST.hasDLInsts())
8027 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
8028 else
8029 splitScalar64BitXnor(Worklist, Inst, MDT);
8030 Inst.eraseFromParent();
8031 return;
8032
8033 case AMDGPU::S_ANDN2_B64:
8034 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
8035 Inst.eraseFromParent();
8036 return;
8037
8038 case AMDGPU::S_ORN2_B64:
8039 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
8040 Inst.eraseFromParent();
8041 return;
8042
8043 case AMDGPU::S_BREV_B64:
8044 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
8045 Inst.eraseFromParent();
8046 return;
8047
8048 case AMDGPU::S_NOT_B64:
8049 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
8050 Inst.eraseFromParent();
8051 return;
8052
8053 case AMDGPU::S_BCNT1_I32_B64:
8054 splitScalar64BitBCNT(Worklist, Inst);
8055 Inst.eraseFromParent();
8056 return;
8057
8058 case AMDGPU::S_BFE_I64:
8059 splitScalar64BitBFE(Worklist, Inst);
8060 Inst.eraseFromParent();
8061 return;
8062
8063 case AMDGPU::S_FLBIT_I32_B64:
8064 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
8065 Inst.eraseFromParent();
8066 return;
8067 case AMDGPU::S_FF1_I32_B64:
8068 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
8069 Inst.eraseFromParent();
8070 return;
8071
8072 case AMDGPU::S_LSHL_B32:
8073 if (ST.hasOnlyRevVALUShifts()) {
8074 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
8075 swapOperands(Inst);
8076 }
8077 break;
8078 case AMDGPU::S_ASHR_I32:
8079 if (ST.hasOnlyRevVALUShifts()) {
8080 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
8081 swapOperands(Inst);
8082 }
8083 break;
8084 case AMDGPU::S_LSHR_B32:
8085 if (ST.hasOnlyRevVALUShifts()) {
8086 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
8087 swapOperands(Inst);
8088 }
8089 break;
8090 case AMDGPU::S_LSHL_B64:
8091 if (ST.hasOnlyRevVALUShifts()) {
8092 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
8093 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
8094 : AMDGPU::V_LSHLREV_B64_e64;
8095 swapOperands(Inst);
8096 }
8097 break;
8098 case AMDGPU::S_ASHR_I64:
8099 if (ST.hasOnlyRevVALUShifts()) {
8100 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
8101 swapOperands(Inst);
8102 }
8103 break;
8104 case AMDGPU::S_LSHR_B64:
8105 if (ST.hasOnlyRevVALUShifts()) {
8106 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
8107 swapOperands(Inst);
8108 }
8109 break;
8110
8111 case AMDGPU::S_ABS_I32:
8112 lowerScalarAbs(Worklist, Inst);
8113 Inst.eraseFromParent();
8114 return;
8115
8116 case AMDGPU::S_ABSDIFF_I32:
8117 lowerScalarAbsDiff(Worklist, Inst);
8118 Inst.eraseFromParent();
8119 return;
8120
8121 case AMDGPU::S_CBRANCH_SCC0:
8122 case AMDGPU::S_CBRANCH_SCC1: {
8123 // Clear unused bits of vcc
8124 Register CondReg = Inst.getOperand(1).getReg();
8125 bool IsSCC = CondReg == AMDGPU::SCC;
8127 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
8128 .addReg(LMC.ExecReg)
8129 .addReg(IsSCC ? LMC.VccReg : CondReg);
8130 Inst.removeOperand(1);
8131 } break;
8132
8133 case AMDGPU::S_BFE_U64:
8134 case AMDGPU::S_BFM_B64:
8135 llvm_unreachable("Moving this op to VALU not implemented");
8136
8137 case AMDGPU::S_PACK_LL_B32_B16:
8138 case AMDGPU::S_PACK_LH_B32_B16:
8139 case AMDGPU::S_PACK_HL_B32_B16:
8140 case AMDGPU::S_PACK_HH_B32_B16:
8141 movePackToVALU(Worklist, MRI, Inst);
8142 Inst.eraseFromParent();
8143 return;
8144
8145 case AMDGPU::S_XNOR_B32:
8146 lowerScalarXnor(Worklist, Inst);
8147 Inst.eraseFromParent();
8148 return;
8149
8150 case AMDGPU::S_NAND_B32:
8151 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
8152 Inst.eraseFromParent();
8153 return;
8154
8155 case AMDGPU::S_NOR_B32:
8156 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
8157 Inst.eraseFromParent();
8158 return;
8159
8160 case AMDGPU::S_ANDN2_B32:
8161 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
8162 Inst.eraseFromParent();
8163 return;
8164
8165 case AMDGPU::S_ORN2_B32:
8166 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
8167 Inst.eraseFromParent();
8168 return;
8169
8170 // TODO: remove as soon as everything is ready
8171 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
8172 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
8173 // can only be selected from the uniform SDNode.
8174 case AMDGPU::S_ADD_CO_PSEUDO:
8175 case AMDGPU::S_SUB_CO_PSEUDO: {
8176 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
8177 ? AMDGPU::V_ADDC_U32_e64
8178 : AMDGPU::V_SUBB_U32_e64;
8179 const auto *CarryRC = RI.getWaveMaskRegClass();
8180
8181 Register CarryInReg = Inst.getOperand(4).getReg();
8182 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
8183 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
8184 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
8185 .addReg(CarryInReg);
8186 }
8187
8188 Register CarryOutReg = Inst.getOperand(1).getReg();
8189
8190 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
8191 MRI.getRegClass(Inst.getOperand(0).getReg())));
8192 MachineInstr *CarryOp =
8193 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
8194 .addReg(CarryOutReg, RegState::Define)
8195 .add(Inst.getOperand(2))
8196 .add(Inst.getOperand(3))
8197 .addReg(CarryInReg)
8198 .addImm(0);
8199 legalizeOperands(*CarryOp);
8200 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
8201 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8202 Inst.eraseFromParent();
8203 }
8204 return;
8205 case AMDGPU::S_UADDO_PSEUDO:
8206 case AMDGPU::S_USUBO_PSEUDO: {
8207 MachineOperand &Dest0 = Inst.getOperand(0);
8208 MachineOperand &Dest1 = Inst.getOperand(1);
8209 MachineOperand &Src0 = Inst.getOperand(2);
8210 MachineOperand &Src1 = Inst.getOperand(3);
8211
8212 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8213 ? AMDGPU::V_ADD_CO_U32_e64
8214 : AMDGPU::V_SUB_CO_U32_e64;
8215 const TargetRegisterClass *NewRC =
8216 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
8217 Register DestReg = MRI.createVirtualRegister(NewRC);
8218 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
8219 .addReg(Dest1.getReg(), RegState::Define)
8220 .add(Src0)
8221 .add(Src1)
8222 .addImm(0); // clamp bit
8223
8224 legalizeOperands(*NewInstr, MDT);
8225 MRI.replaceRegWith(Dest0.getReg(), DestReg);
8226 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8227 Inst.eraseFromParent();
8228 }
8229 return;
8230 case AMDGPU::S_LSHL1_ADD_U32:
8231 case AMDGPU::S_LSHL2_ADD_U32:
8232 case AMDGPU::S_LSHL3_ADD_U32:
8233 case AMDGPU::S_LSHL4_ADD_U32: {
8234 MachineOperand &Dest = Inst.getOperand(0);
8235 MachineOperand &Src0 = Inst.getOperand(1);
8236 MachineOperand &Src1 = Inst.getOperand(2);
8237 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8238 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8239 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8240 : 4);
8241
8242 const TargetRegisterClass *NewRC =
8243 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
8244 Register DestReg = MRI.createVirtualRegister(NewRC);
8245 MachineInstr *NewInstr =
8246 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8247 .add(Src0)
8248 .addImm(ShiftAmt)
8249 .add(Src1);
8250
8251 legalizeOperands(*NewInstr, MDT);
8252 MRI.replaceRegWith(Dest.getReg(), DestReg);
8253 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8254 Inst.eraseFromParent();
8255 }
8256 return;
8257 case AMDGPU::S_CSELECT_B32:
8258 case AMDGPU::S_CSELECT_B64:
8259 lowerSelect(Worklist, Inst, MDT);
8260 Inst.eraseFromParent();
8261 return;
8262 case AMDGPU::S_CMP_EQ_I32:
8263 case AMDGPU::S_CMP_LG_I32:
8264 case AMDGPU::S_CMP_GT_I32:
8265 case AMDGPU::S_CMP_GE_I32:
8266 case AMDGPU::S_CMP_LT_I32:
8267 case AMDGPU::S_CMP_LE_I32:
8268 case AMDGPU::S_CMP_EQ_U32:
8269 case AMDGPU::S_CMP_LG_U32:
8270 case AMDGPU::S_CMP_GT_U32:
8271 case AMDGPU::S_CMP_GE_U32:
8272 case AMDGPU::S_CMP_LT_U32:
8273 case AMDGPU::S_CMP_LE_U32:
8274 case AMDGPU::S_CMP_EQ_U64:
8275 case AMDGPU::S_CMP_LG_U64:
8276 case AMDGPU::S_CMP_LT_F32:
8277 case AMDGPU::S_CMP_EQ_F32:
8278 case AMDGPU::S_CMP_LE_F32:
8279 case AMDGPU::S_CMP_GT_F32:
8280 case AMDGPU::S_CMP_LG_F32:
8281 case AMDGPU::S_CMP_GE_F32:
8282 case AMDGPU::S_CMP_O_F32:
8283 case AMDGPU::S_CMP_U_F32:
8284 case AMDGPU::S_CMP_NGE_F32:
8285 case AMDGPU::S_CMP_NLG_F32:
8286 case AMDGPU::S_CMP_NGT_F32:
8287 case AMDGPU::S_CMP_NLE_F32:
8288 case AMDGPU::S_CMP_NEQ_F32:
8289 case AMDGPU::S_CMP_NLT_F32: {
8290 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8291 auto NewInstr =
8292 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8293 .setMIFlags(Inst.getFlags());
8294 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
8295 0) {
8296 NewInstr
8297 .addImm(0) // src0_modifiers
8298 .add(Inst.getOperand(0)) // src0
8299 .addImm(0) // src1_modifiers
8300 .add(Inst.getOperand(1)) // src1
8301 .addImm(0); // clamp
8302 } else {
8303 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
8304 }
8305 legalizeOperands(*NewInstr, MDT);
8306 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8307 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8308 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8309 Inst.eraseFromParent();
8310 return;
8311 }
8312 case AMDGPU::S_CMP_LT_F16:
8313 case AMDGPU::S_CMP_EQ_F16:
8314 case AMDGPU::S_CMP_LE_F16:
8315 case AMDGPU::S_CMP_GT_F16:
8316 case AMDGPU::S_CMP_LG_F16:
8317 case AMDGPU::S_CMP_GE_F16:
8318 case AMDGPU::S_CMP_O_F16:
8319 case AMDGPU::S_CMP_U_F16:
8320 case AMDGPU::S_CMP_NGE_F16:
8321 case AMDGPU::S_CMP_NLG_F16:
8322 case AMDGPU::S_CMP_NGT_F16:
8323 case AMDGPU::S_CMP_NLE_F16:
8324 case AMDGPU::S_CMP_NEQ_F16:
8325 case AMDGPU::S_CMP_NLT_F16: {
8326 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8327 auto NewInstr =
8328 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8329 .setMIFlags(Inst.getFlags());
8330 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8331 NewInstr
8332 .addImm(0) // src0_modifiers
8333 .add(Inst.getOperand(0)) // src0
8334 .addImm(0) // src1_modifiers
8335 .add(Inst.getOperand(1)) // src1
8336 .addImm(0); // clamp
8337 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8338 NewInstr.addImm(0); // op_sel0
8339 } else {
8340 NewInstr
8341 .add(Inst.getOperand(0))
8342 .add(Inst.getOperand(1));
8343 }
8344 legalizeOperandsVALUt16(*NewInstr, MRI);
8345 legalizeOperands(*NewInstr, MDT);
8346 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8347 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8348 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8349 Inst.eraseFromParent();
8350 return;
8351 }
8352 case AMDGPU::S_CVT_HI_F32_F16: {
8353 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8354 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8355 if (ST.useRealTrue16Insts()) {
8356 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8357 .add(Inst.getOperand(1));
8358 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8359 .addImm(0) // src0_modifiers
8360 .addReg(TmpReg, {}, AMDGPU::hi16)
8361 .addImm(0) // clamp
8362 .addImm(0) // omod
8363 .addImm(0); // op_sel0
8364 } else {
8365 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8366 .addImm(16)
8367 .add(Inst.getOperand(1));
8368 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8369 .addImm(0) // src0_modifiers
8370 .addReg(TmpReg)
8371 .addImm(0) // clamp
8372 .addImm(0); // omod
8373 }
8374
8375 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8376 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8377 Inst.eraseFromParent();
8378 return;
8379 }
8380 case AMDGPU::S_MINIMUM_F32:
8381 case AMDGPU::S_MAXIMUM_F32: {
8382 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8383 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8384 .addImm(0) // src0_modifiers
8385 .add(Inst.getOperand(1))
8386 .addImm(0) // src1_modifiers
8387 .add(Inst.getOperand(2))
8388 .addImm(0) // clamp
8389 .addImm(0); // omod
8390 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8391
8392 legalizeOperands(*NewInstr, MDT);
8393 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8394 Inst.eraseFromParent();
8395 return;
8396 }
8397 case AMDGPU::S_MINIMUM_F16:
8398 case AMDGPU::S_MAXIMUM_F16: {
8399 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8400 ? &AMDGPU::VGPR_16RegClass
8401 : &AMDGPU::VGPR_32RegClass);
8402 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8403 .addImm(0) // src0_modifiers
8404 .add(Inst.getOperand(1))
8405 .addImm(0) // src1_modifiers
8406 .add(Inst.getOperand(2))
8407 .addImm(0) // clamp
8408 .addImm(0) // omod
8409 .addImm(0); // opsel0
8410 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8411 legalizeOperandsVALUt16(*NewInstr, MRI);
8412 legalizeOperands(*NewInstr, MDT);
8413 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8414 Inst.eraseFromParent();
8415 return;
8416 }
8417 case AMDGPU::V_S_EXP_F16_e64:
8418 case AMDGPU::V_S_LOG_F16_e64:
8419 case AMDGPU::V_S_RCP_F16_e64:
8420 case AMDGPU::V_S_RSQ_F16_e64:
8421 case AMDGPU::V_S_SQRT_F16_e64: {
8422 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8423 ? &AMDGPU::VGPR_16RegClass
8424 : &AMDGPU::VGPR_32RegClass);
8425 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8426 .add(Inst.getOperand(1)) // src0_modifiers
8427 .add(Inst.getOperand(2))
8428 .add(Inst.getOperand(3)) // clamp
8429 .add(Inst.getOperand(4)) // omod
8430 .setMIFlags(Inst.getFlags());
8431 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8432 NewInstr.addImm(0); // opsel0
8433 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8434 legalizeOperandsVALUt16(*NewInstr, MRI);
8435 legalizeOperands(*NewInstr, MDT);
8436 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8437 Inst.eraseFromParent();
8438 return;
8439 }
8440 }
8441
8442 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8443 // We cannot move this instruction to the VALU, so we should try to
8444 // legalize its operands instead.
8445 legalizeOperands(Inst, MDT);
8446 return;
8447 }
8448 // Handle converting generic instructions like COPY-to-SGPR into
8449 // COPY-to-VGPR.
8450 if (NewOpcode == Opcode) {
8451 Register DstReg = Inst.getOperand(0).getReg();
8452 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8453
8454 if (Inst.isCopy() && DstReg.isPhysical() &&
8455 Inst.getOperand(1).getReg().isVirtual()) {
8456 handleCopyToPhysHelper(Worklist, DstReg, Inst, MRI, WaterFalls,
8457 V2SPhyCopiesToErase);
8458 return;
8459 }
8460
8461 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
8462 Register NewDstReg = Inst.getOperand(1).getReg();
8463 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8464 if (const TargetRegisterClass *CommonRC =
8465 RI.getCommonSubClass(NewDstRC, SrcRC)) {
8466 // Instead of creating a copy where src and dst are the same register
8467 // class, we just replace all uses of dst with src. These kinds of
8468 // copies interfere with the heuristics MachineSink uses to decide
8469 // whether or not to split a critical edge. Since the pass assumes
8470 // that copies will end up as machine instructions and not be
8471 // eliminated.
8472 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8473 MRI.replaceRegWith(DstReg, NewDstReg);
8474 MRI.clearKillFlags(NewDstReg);
8475 Inst.getOperand(0).setReg(DstReg);
8476
8477 if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8478 llvm_unreachable("failed to constrain register");
8479
8480 Inst.eraseFromParent();
8481
8482 for (MachineOperand &UseMO :
8483 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8484 MachineInstr &UseMI = *UseMO.getParent();
8485
8486 // Legalize t16 operands since replaceReg is called after
8487 // addUsersToVALU.
8489
8490 unsigned OpIdx = UseMI.getOperandNo(&UseMO);
8491 if (const TargetRegisterClass *OpRC =
8492 getRegClass(UseMI.getDesc(), OpIdx))
8493 MRI.constrainRegClass(NewDstReg, OpRC);
8494 }
8495
8496 return;
8497 }
8498 }
8499
8500 // If this is a v2s copy between 16bit and 32bit reg,
8501 // replace vgpr copy to reg_sequence/extract_subreg
8502 // This can be removed after we have sgpr16 in place
8503 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8504 Inst.getOperand(1).getReg().isVirtual() &&
8505 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8506 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8507 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8508 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8509 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8510 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8511 get(AMDGPU::IMPLICIT_DEF), Undef);
8512 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8513 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8514 .addReg(Inst.getOperand(1).getReg())
8515 .addImm(AMDGPU::lo16)
8516 .addReg(Undef)
8517 .addImm(AMDGPU::hi16);
8518 Inst.eraseFromParent();
8519 MRI.replaceRegWith(DstReg, NewDstReg);
8520 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8521 return;
8522 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8523 AMDGPU::lo16)) {
8524 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8525 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8526 MRI.replaceRegWith(DstReg, NewDstReg);
8527 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8528 return;
8529 }
8530 }
8531
8532 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8533 MRI.replaceRegWith(DstReg, NewDstReg);
8534 legalizeOperands(Inst, MDT);
8535 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8536 return;
8537 }
8538
8539 // Use the new VALU Opcode.
8540 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8541 .setMIFlags(Inst.getFlags());
8542 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8543 // Intersperse VOP3 modifiers among the SALU operands.
8544 NewInstr->addOperand(Inst.getOperand(0));
8545 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8546 AMDGPU::OpName::src0_modifiers) >= 0)
8547 NewInstr.addImm(0);
8548 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8549 const MachineOperand &Src = Inst.getOperand(1);
8550 NewInstr->addOperand(Src);
8551 }
8552
8553 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8554 // We are converting these to a BFE, so we need to add the missing
8555 // operands for the size and offset.
8556 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8557 NewInstr.addImm(0);
8558 NewInstr.addImm(Size);
8559 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8560 // The VALU version adds the second operand to the result, so insert an
8561 // extra 0 operand.
8562 NewInstr.addImm(0);
8563 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8564 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8565 // If we need to move this to VGPRs, we need to unpack the second
8566 // operand back into the 2 separate ones for bit offset and width.
8567 assert(OffsetWidthOp.isImm() &&
8568 "Scalar BFE is only implemented for constant width and offset");
8569 uint32_t Imm = OffsetWidthOp.getImm();
8570
8571 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8572 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8573 NewInstr.addImm(Offset);
8574 NewInstr.addImm(BitWidth);
8575 } else {
8576 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8577 AMDGPU::OpName::src1_modifiers) >= 0)
8578 NewInstr.addImm(0);
8579 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8580 NewInstr->addOperand(Inst.getOperand(2));
8581 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8582 AMDGPU::OpName::src2_modifiers) >= 0)
8583 NewInstr.addImm(0);
8584 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8585 NewInstr->addOperand(Inst.getOperand(3));
8586 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8587 NewInstr.addImm(0);
8588 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8589 NewInstr.addImm(0);
8590 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8591 NewInstr.addImm(0);
8592 }
8593 } else {
8594 // Just copy the SALU operands.
8595 for (const MachineOperand &Op : Inst.explicit_operands())
8596 NewInstr->addOperand(Op);
8597 }
8598
8599 // Remove any references to SCC. Vector instructions can't read from it, and
8600 // We're just about to add the implicit use / defs of VCC, and we don't want
8601 // both.
8602 for (MachineOperand &Op : Inst.implicit_operands()) {
8603 if (Op.getReg() == AMDGPU::SCC) {
8604 // Only propagate through live-def of SCC.
8605 if (Op.isDef() && !Op.isDead())
8606 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8607 if (Op.isUse())
8608 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8609 }
8610 }
8611 Inst.eraseFromParent();
8612 Register NewDstReg;
8613 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8614 Register DstReg = NewInstr->getOperand(0).getReg();
8615 assert(DstReg.isVirtual());
8616 // Update the destination register class.
8617 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8618 assert(NewDstRC);
8619 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8620 MRI.replaceRegWith(DstReg, NewDstReg);
8621 }
8622 fixImplicitOperands(*NewInstr);
8623
8624 legalizeOperandsVALUt16(*NewInstr, MRI);
8625
8626 // Legalize the operands
8627 legalizeOperands(*NewInstr, MDT);
8628 if (NewDstReg)
8629 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8630}
8631
8632// Add/sub require special handling to deal with carry outs.
8633std::pair<bool, MachineBasicBlock *>
8634SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8635 MachineDominatorTree *MDT) const {
8636 if (ST.hasAddNoCarryInsts()) {
8637 // Assume there is no user of scc since we don't select this in that case.
8638 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8639 // is used.
8640
8641 MachineBasicBlock &MBB = *Inst.getParent();
8642 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8643
8644 Register OldDstReg = Inst.getOperand(0).getReg();
8645 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8646
8647 unsigned Opc = Inst.getOpcode();
8648 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8649
8650 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8651 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8652
8653 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8654 Inst.removeOperand(3);
8655
8656 Inst.setDesc(get(NewOpc));
8657 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8658 Inst.addImplicitDefUseOperands(*MBB.getParent());
8659 MRI.replaceRegWith(OldDstReg, ResultReg);
8660 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8661
8662 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8663 return std::pair(true, NewBB);
8664 }
8665
8666 return std::pair(false, nullptr);
8667}
8668
8669void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8670 MachineDominatorTree *MDT) const {
8671
8672 MachineBasicBlock &MBB = *Inst.getParent();
8673 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8674 MachineBasicBlock::iterator MII = Inst;
8675 const DebugLoc &DL = Inst.getDebugLoc();
8676
8677 MachineOperand &Dest = Inst.getOperand(0);
8678 MachineOperand &Src0 = Inst.getOperand(1);
8679 MachineOperand &Src1 = Inst.getOperand(2);
8680 MachineOperand &Cond = Inst.getOperand(3);
8681
8682 Register CondReg = Cond.getReg();
8683 bool IsSCC = (CondReg == AMDGPU::SCC);
8684
8685 // If this is a trivial select where the condition is effectively not SCC
8686 // (CondReg is a source of copy to SCC), then the select is semantically
8687 // equivalent to copying CondReg. Hence, there is no need to create
8688 // V_CNDMASK, we can just use that and bail out.
8689 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8690 (Src1.getImm() == 0)) {
8691 MRI.replaceRegWith(Dest.getReg(), CondReg);
8692 return;
8693 }
8694
8695 Register NewCondReg = CondReg;
8696 if (IsSCC) {
8697 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8698 NewCondReg = MRI.createVirtualRegister(TC);
8699
8700 // Now look for the closest SCC def if it is a copy
8701 // replacing the CondReg with the COPY source register
8702 bool CopyFound = false;
8703 for (MachineInstr &CandI :
8705 Inst.getParent()->rend())) {
8706 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8707 -1) {
8708 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8709 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8710 .addReg(CandI.getOperand(1).getReg());
8711 CopyFound = true;
8712 }
8713 break;
8714 }
8715 }
8716 if (!CopyFound) {
8717 // SCC def is not a copy
8718 // Insert a trivial select instead of creating a copy, because a copy from
8719 // SCC would semantically mean just copying a single bit, but we may need
8720 // the result to be a vector condition mask that needs preserving.
8721 unsigned Opcode =
8722 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8723 auto NewSelect =
8724 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8725 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8726 }
8727 }
8728
8729 Register NewDestReg = MRI.createVirtualRegister(
8730 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8731 MachineInstr *NewInst;
8732 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8733 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8734 .addImm(0)
8735 .add(Src1) // False
8736 .addImm(0)
8737 .add(Src0) // True
8738 .addReg(NewCondReg);
8739 } else {
8740 NewInst =
8741 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8742 .add(Src1) // False
8743 .add(Src0) // True
8744 .addReg(NewCondReg);
8745 }
8746 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8747 legalizeOperands(*NewInst, MDT);
8748 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8749}
8750
8751void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8752 MachineInstr &Inst) const {
8753 MachineBasicBlock &MBB = *Inst.getParent();
8754 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8755 MachineBasicBlock::iterator MII = Inst;
8756 const DebugLoc &DL = Inst.getDebugLoc();
8757
8758 MachineOperand &Dest = Inst.getOperand(0);
8759 MachineOperand &Src = Inst.getOperand(1);
8760 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8761 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8762
8763 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8764 : AMDGPU::V_SUB_CO_U32_e32;
8765
8766 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8767 .addImm(0)
8768 .addReg(Src.getReg());
8769
8770 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8771 .addReg(Src.getReg())
8772 .addReg(TmpReg);
8773
8774 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8775 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8776}
8777
8778void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8779 MachineInstr &Inst) const {
8780 MachineBasicBlock &MBB = *Inst.getParent();
8781 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8782 MachineBasicBlock::iterator MII = Inst;
8783 const DebugLoc &DL = Inst.getDebugLoc();
8784
8785 MachineOperand &Dest = Inst.getOperand(0);
8786 MachineOperand &Src1 = Inst.getOperand(1);
8787 MachineOperand &Src2 = Inst.getOperand(2);
8788 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8789 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8790 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8791
8792 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8793 : AMDGPU::V_SUB_CO_U32_e32;
8794
8795 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8796 .addReg(Src1.getReg())
8797 .addReg(Src2.getReg());
8798
8799 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8800
8801 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8802 .addReg(SubResultReg)
8803 .addReg(TmpReg);
8804
8805 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8806 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8807}
8808
8809void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8810 MachineInstr &Inst) const {
8811 MachineBasicBlock &MBB = *Inst.getParent();
8812 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8813 MachineBasicBlock::iterator MII = Inst;
8814 const DebugLoc &DL = Inst.getDebugLoc();
8815
8816 MachineOperand &Dest = Inst.getOperand(0);
8817 MachineOperand &Src0 = Inst.getOperand(1);
8818 MachineOperand &Src1 = Inst.getOperand(2);
8819
8820 if (ST.hasDLInsts()) {
8821 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8822 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8823 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8824
8825 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8826 .add(Src0)
8827 .add(Src1);
8828
8829 MRI.replaceRegWith(Dest.getReg(), NewDest);
8830 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8831 } else {
8832 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8833 // invert either source and then perform the XOR. If either source is a
8834 // scalar register, then we can leave the inversion on the scalar unit to
8835 // achieve a better distribution of scalar and vector instructions.
8836 bool Src0IsSGPR = Src0.isReg() &&
8837 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8838 bool Src1IsSGPR = Src1.isReg() &&
8839 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8840 MachineInstr *Xor;
8841 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8842 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8843
8844 // Build a pair of scalar instructions and add them to the work list.
8845 // The next iteration over the work list will lower these to the vector
8846 // unit as necessary.
8847 if (Src0IsSGPR) {
8848 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8849 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8850 .addReg(Temp)
8851 .add(Src1);
8852 } else if (Src1IsSGPR) {
8853 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8854 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8855 .add(Src0)
8856 .addReg(Temp);
8857 } else {
8858 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8859 .add(Src0)
8860 .add(Src1);
8861 MachineInstr *Not =
8862 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8863 Worklist.insert(Not);
8864 }
8865
8866 MRI.replaceRegWith(Dest.getReg(), NewDest);
8867
8868 Worklist.insert(Xor);
8869
8870 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8871 }
8872}
8873
8874void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8875 MachineInstr &Inst,
8876 unsigned Opcode) const {
8877 MachineBasicBlock &MBB = *Inst.getParent();
8878 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8879 MachineBasicBlock::iterator MII = Inst;
8880 const DebugLoc &DL = Inst.getDebugLoc();
8881
8882 MachineOperand &Dest = Inst.getOperand(0);
8883 MachineOperand &Src0 = Inst.getOperand(1);
8884 MachineOperand &Src1 = Inst.getOperand(2);
8885
8886 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8887 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8888
8889 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8890 .add(Src0)
8891 .add(Src1);
8892
8893 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8894 .addReg(Interm);
8895
8896 Worklist.insert(&Op);
8897 Worklist.insert(&Not);
8898
8899 MRI.replaceRegWith(Dest.getReg(), NewDest);
8900 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8901}
8902
8903void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8904 MachineInstr &Inst,
8905 unsigned Opcode) const {
8906 MachineBasicBlock &MBB = *Inst.getParent();
8907 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8908 MachineBasicBlock::iterator MII = Inst;
8909 const DebugLoc &DL = Inst.getDebugLoc();
8910
8911 MachineOperand &Dest = Inst.getOperand(0);
8912 MachineOperand &Src0 = Inst.getOperand(1);
8913 MachineOperand &Src1 = Inst.getOperand(2);
8914
8915 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8916 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8917
8918 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8919 .add(Src1);
8920
8921 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8922 .add(Src0)
8923 .addReg(Interm);
8924
8925 Worklist.insert(&Not);
8926 Worklist.insert(&Op);
8927
8928 MRI.replaceRegWith(Dest.getReg(), NewDest);
8929 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8930}
8931
8932void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8933 MachineInstr &Inst, unsigned Opcode,
8934 bool Swap) const {
8935 MachineBasicBlock &MBB = *Inst.getParent();
8936 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8937
8938 MachineOperand &Dest = Inst.getOperand(0);
8939 MachineOperand &Src0 = Inst.getOperand(1);
8940 const DebugLoc &DL = Inst.getDebugLoc();
8941
8942 MachineBasicBlock::iterator MII = Inst;
8943
8944 const MCInstrDesc &InstDesc = get(Opcode);
8945 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8946 MRI.getRegClass(Src0.getReg()) :
8947 &AMDGPU::SGPR_32RegClass;
8948
8949 const TargetRegisterClass *Src0SubRC =
8950 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8951
8952 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8953 AMDGPU::sub0, Src0SubRC);
8954
8955 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8956 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8957 const TargetRegisterClass *NewDestSubRC =
8958 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8959
8960 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8961 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8962
8963 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8964 AMDGPU::sub1, Src0SubRC);
8965
8966 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8967 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8968
8969 if (Swap)
8970 std::swap(DestSub0, DestSub1);
8971
8972 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8973 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8974 .addReg(DestSub0)
8975 .addImm(AMDGPU::sub0)
8976 .addReg(DestSub1)
8977 .addImm(AMDGPU::sub1);
8978
8979 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8980
8981 Worklist.insert(&LoHalf);
8982 Worklist.insert(&HiHalf);
8983
8984 // We don't need to legalizeOperands here because for a single operand, src0
8985 // will support any kind of input.
8986
8987 // Move all users of this moved value.
8988 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8989}
8990
8991// There is not a vector equivalent of s_mul_u64. For this reason, we need to
8992// split the s_mul_u64 in 32-bit vector multiplications.
8993void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8994 MachineInstr &Inst,
8995 MachineDominatorTree *MDT) const {
8996 MachineBasicBlock &MBB = *Inst.getParent();
8997 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8998
8999 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9000 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9001 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9002
9003 MachineOperand &Dest = Inst.getOperand(0);
9004 MachineOperand &Src0 = Inst.getOperand(1);
9005 MachineOperand &Src1 = Inst.getOperand(2);
9006 const DebugLoc &DL = Inst.getDebugLoc();
9007 MachineBasicBlock::iterator MII = Inst;
9008
9009 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
9010 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
9011 const TargetRegisterClass *Src0SubRC =
9012 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9013 if (RI.isSGPRClass(Src0SubRC))
9014 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
9015 const TargetRegisterClass *Src1SubRC =
9016 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9017 if (RI.isSGPRClass(Src1SubRC))
9018 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
9019
9020 // First, we extract the low 32-bit and high 32-bit values from each of the
9021 // operands.
9022 MachineOperand Op0L =
9023 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
9024 MachineOperand Op1L =
9025 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
9026 MachineOperand Op0H =
9027 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
9028 MachineOperand Op1H =
9029 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
9030
9031 // The multilication is done as follows:
9032 //
9033 // Op1H Op1L
9034 // * Op0H Op0L
9035 // --------------------
9036 // Op1H*Op0L Op1L*Op0L
9037 // + Op1H*Op0H Op1L*Op0H
9038 // -----------------------------------------
9039 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
9040 //
9041 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
9042 // value and that would overflow.
9043 // The low 32-bit value is Op1L*Op0L.
9044 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
9045
9046 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9047 MachineInstr *Op1L_Op0H =
9048 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
9049 .add(Op1L)
9050 .add(Op0H);
9051
9052 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9053 MachineInstr *Op1H_Op0L =
9054 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
9055 .add(Op1H)
9056 .add(Op0L);
9057
9058 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9059 MachineInstr *Carry =
9060 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
9061 .add(Op1L)
9062 .add(Op0L);
9063
9064 MachineInstr *LoHalf =
9065 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
9066 .add(Op1L)
9067 .add(Op0L);
9068
9069 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9070 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
9071 .addReg(Op1L_Op0H_Reg)
9072 .addReg(Op1H_Op0L_Reg);
9073
9074 MachineInstr *HiHalf =
9075 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
9076 .addReg(AddReg)
9077 .addReg(CarryReg);
9078
9079 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9080 .addReg(DestSub0)
9081 .addImm(AMDGPU::sub0)
9082 .addReg(DestSub1)
9083 .addImm(AMDGPU::sub1);
9084
9085 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9086
9087 // Try to legalize the operands in case we need to swap the order to keep it
9088 // valid.
9089 legalizeOperands(*Op1L_Op0H, MDT);
9090 legalizeOperands(*Op1H_Op0L, MDT);
9091 legalizeOperands(*Carry, MDT);
9092 legalizeOperands(*LoHalf, MDT);
9093 legalizeOperands(*Add, MDT);
9094 legalizeOperands(*HiHalf, MDT);
9095
9096 // Move all users of this moved value.
9097 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9098}
9099
9100// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
9101// multiplications.
9102void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
9103 MachineInstr &Inst,
9104 MachineDominatorTree *MDT) const {
9105 MachineBasicBlock &MBB = *Inst.getParent();
9106 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9107
9108 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9109 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9110 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9111
9112 MachineOperand &Dest = Inst.getOperand(0);
9113 MachineOperand &Src0 = Inst.getOperand(1);
9114 MachineOperand &Src1 = Inst.getOperand(2);
9115 const DebugLoc &DL = Inst.getDebugLoc();
9116 MachineBasicBlock::iterator MII = Inst;
9117
9118 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
9119 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
9120 const TargetRegisterClass *Src0SubRC =
9121 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9122 if (RI.isSGPRClass(Src0SubRC))
9123 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
9124 const TargetRegisterClass *Src1SubRC =
9125 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9126 if (RI.isSGPRClass(Src1SubRC))
9127 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
9128
9129 // First, we extract the low 32-bit and high 32-bit values from each of the
9130 // operands.
9131 MachineOperand Op0L =
9132 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
9133 MachineOperand Op1L =
9134 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
9135
9136 unsigned Opc = Inst.getOpcode();
9137 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
9138 ? AMDGPU::V_MUL_HI_U32_e64
9139 : AMDGPU::V_MUL_HI_I32_e64;
9140 MachineInstr *HiHalf =
9141 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
9142
9143 MachineInstr *LoHalf =
9144 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
9145 .add(Op1L)
9146 .add(Op0L);
9147
9148 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9149 .addReg(DestSub0)
9150 .addImm(AMDGPU::sub0)
9151 .addReg(DestSub1)
9152 .addImm(AMDGPU::sub1);
9153
9154 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9155
9156 // Try to legalize the operands in case we need to swap the order to keep it
9157 // valid.
9158 legalizeOperands(*HiHalf, MDT);
9159 legalizeOperands(*LoHalf, MDT);
9160
9161 // Move all users of this moved value.
9162 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9163}
9164
9165void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
9166 MachineInstr &Inst, unsigned Opcode,
9167 MachineDominatorTree *MDT) const {
9168 MachineBasicBlock &MBB = *Inst.getParent();
9169 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9170
9171 MachineOperand &Dest = Inst.getOperand(0);
9172 MachineOperand &Src0 = Inst.getOperand(1);
9173 MachineOperand &Src1 = Inst.getOperand(2);
9174 const DebugLoc &DL = Inst.getDebugLoc();
9175
9176 MachineBasicBlock::iterator MII = Inst;
9177
9178 const MCInstrDesc &InstDesc = get(Opcode);
9179 const TargetRegisterClass *Src0RC = Src0.isReg() ?
9180 MRI.getRegClass(Src0.getReg()) :
9181 &AMDGPU::SGPR_32RegClass;
9182
9183 const TargetRegisterClass *Src0SubRC =
9184 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9185 const TargetRegisterClass *Src1RC = Src1.isReg() ?
9186 MRI.getRegClass(Src1.getReg()) :
9187 &AMDGPU::SGPR_32RegClass;
9188
9189 const TargetRegisterClass *Src1SubRC =
9190 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9191
9192 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9193 AMDGPU::sub0, Src0SubRC);
9194 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9195 AMDGPU::sub0, Src1SubRC);
9196 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9197 AMDGPU::sub1, Src0SubRC);
9198 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9199 AMDGPU::sub1, Src1SubRC);
9200
9201 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9202 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
9203 const TargetRegisterClass *NewDestSubRC =
9204 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9205
9206 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
9207 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
9208 .add(SrcReg0Sub0)
9209 .add(SrcReg1Sub0);
9210
9211 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
9212 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
9213 .add(SrcReg0Sub1)
9214 .add(SrcReg1Sub1);
9215
9216 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
9217 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9218 .addReg(DestSub0)
9219 .addImm(AMDGPU::sub0)
9220 .addReg(DestSub1)
9221 .addImm(AMDGPU::sub1);
9222
9223 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9224
9225 Worklist.insert(&LoHalf);
9226 Worklist.insert(&HiHalf);
9227
9228 // Move all users of this moved value.
9229 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9230}
9231
9232void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9233 MachineInstr &Inst,
9234 MachineDominatorTree *MDT) const {
9235 MachineBasicBlock &MBB = *Inst.getParent();
9236 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9237
9238 MachineOperand &Dest = Inst.getOperand(0);
9239 MachineOperand &Src0 = Inst.getOperand(1);
9240 MachineOperand &Src1 = Inst.getOperand(2);
9241 const DebugLoc &DL = Inst.getDebugLoc();
9242
9243 MachineBasicBlock::iterator MII = Inst;
9244
9245 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9246
9247 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9248
9249 MachineOperand* Op0;
9250 MachineOperand* Op1;
9251
9252 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9253 Op0 = &Src0;
9254 Op1 = &Src1;
9255 } else {
9256 Op0 = &Src1;
9257 Op1 = &Src0;
9258 }
9259
9260 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9261 .add(*Op0);
9262
9263 Register NewDest = MRI.createVirtualRegister(DestRC);
9264
9265 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9266 .addReg(Interm)
9267 .add(*Op1);
9268
9269 MRI.replaceRegWith(Dest.getReg(), NewDest);
9270
9271 Worklist.insert(&Xor);
9272}
9273
9274void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9275 MachineInstr &Inst) const {
9276 MachineBasicBlock &MBB = *Inst.getParent();
9277 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9278
9279 MachineBasicBlock::iterator MII = Inst;
9280 const DebugLoc &DL = Inst.getDebugLoc();
9281
9282 MachineOperand &Dest = Inst.getOperand(0);
9283 MachineOperand &Src = Inst.getOperand(1);
9284
9285 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
9286 const TargetRegisterClass *SrcRC = Src.isReg() ?
9287 MRI.getRegClass(Src.getReg()) :
9288 &AMDGPU::SGPR_32RegClass;
9289
9290 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9291 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9292
9293 const TargetRegisterClass *SrcSubRC =
9294 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9295
9296 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9297 AMDGPU::sub0, SrcSubRC);
9298 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9299 AMDGPU::sub1, SrcSubRC);
9300
9301 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
9302
9303 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
9304
9305 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9306
9307 // We don't need to legalize operands here. src0 for either instruction can be
9308 // an SGPR, and the second input is unused or determined here.
9309 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9310}
9311
9312void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9313 MachineInstr &Inst) const {
9314 MachineBasicBlock &MBB = *Inst.getParent();
9315 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9316 MachineBasicBlock::iterator MII = Inst;
9317 const DebugLoc &DL = Inst.getDebugLoc();
9318
9319 MachineOperand &Dest = Inst.getOperand(0);
9320 uint32_t Imm = Inst.getOperand(2).getImm();
9321 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9322 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
9323
9324 (void) Offset;
9325
9326 // Only sext_inreg cases handled.
9327 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9328 Offset == 0 && "Not implemented");
9329
9330 if (BitWidth < 32) {
9331 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9332 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9333 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9334
9335 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
9336 .addReg(Inst.getOperand(1).getReg(), {}, AMDGPU::sub0)
9337 .addImm(0)
9338 .addImm(BitWidth);
9339
9340 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
9341 .addImm(31)
9342 .addReg(MidRegLo);
9343
9344 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9345 .addReg(MidRegLo)
9346 .addImm(AMDGPU::sub0)
9347 .addReg(MidRegHi)
9348 .addImm(AMDGPU::sub1);
9349
9350 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9351 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9352 return;
9353 }
9354
9355 MachineOperand &Src = Inst.getOperand(1);
9356 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9357 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9358
9359 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
9360 .addImm(31)
9361 .addReg(Src.getReg(), {}, AMDGPU::sub0);
9362
9363 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9364 .addReg(Src.getReg(), {}, AMDGPU::sub0)
9365 .addImm(AMDGPU::sub0)
9366 .addReg(TmpReg)
9367 .addImm(AMDGPU::sub1);
9368
9369 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9370 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9371}
9372
9373void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9374 MachineInstr &Inst, unsigned Opcode,
9375 MachineDominatorTree *MDT) const {
9376 // (S_FLBIT_I32_B64 hi:lo) ->
9377 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9378 // (S_FF1_I32_B64 hi:lo) ->
9379 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
9380
9381 MachineBasicBlock &MBB = *Inst.getParent();
9382 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9383 MachineBasicBlock::iterator MII = Inst;
9384 const DebugLoc &DL = Inst.getDebugLoc();
9385
9386 MachineOperand &Dest = Inst.getOperand(0);
9387 MachineOperand &Src = Inst.getOperand(1);
9388
9389 const MCInstrDesc &InstDesc = get(Opcode);
9390
9391 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9392 unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
9393 : AMDGPU::V_ADD_CO_U32_e32;
9394
9395 const TargetRegisterClass *SrcRC =
9396 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9397 const TargetRegisterClass *SrcSubRC =
9398 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9399
9400 MachineOperand SrcRegSub0 =
9401 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9402 MachineOperand SrcRegSub1 =
9403 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9404
9405 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9406 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9407 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9408 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9409
9410 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9411
9412 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9413
9414 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9415 .addReg(IsCtlz ? MidReg1 : MidReg2)
9416 .addImm(32)
9417 .addImm(1); // enable clamp
9418
9419 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9420 .addReg(MidReg3)
9421 .addReg(IsCtlz ? MidReg2 : MidReg1);
9422
9423 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9424
9425 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9426}
9427
9428void SIInstrInfo::addUsersToMoveToVALUWorklist(
9429 Register DstReg, MachineRegisterInfo &MRI,
9430 SIInstrWorklist &Worklist) const {
9431 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9432 MachineInstr &UseMI = *MO.getParent();
9433
9434 unsigned OpNo = 0;
9435
9436 switch (UseMI.getOpcode()) {
9437 case AMDGPU::COPY:
9438 case AMDGPU::WQM:
9439 case AMDGPU::SOFT_WQM:
9440 case AMDGPU::STRICT_WWM:
9441 case AMDGPU::STRICT_WQM:
9442 case AMDGPU::REG_SEQUENCE:
9443 case AMDGPU::PHI:
9444 case AMDGPU::INSERT_SUBREG:
9445 break;
9446 default:
9447 OpNo = MO.getOperandNo();
9448 break;
9449 }
9450
9451 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9452 MRI.constrainRegClass(DstReg, OpRC);
9453
9454 if (!RI.hasVectorRegisters(OpRC))
9455 Worklist.insert(&UseMI);
9456 else
9457 // Legalization could change user list.
9458 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9459 }
9460}
9461
9462void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9464 MachineInstr &Inst) const {
9465 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9466 MachineBasicBlock *MBB = Inst.getParent();
9467 MachineOperand &Src0 = Inst.getOperand(1);
9468 MachineOperand &Src1 = Inst.getOperand(2);
9469 const DebugLoc &DL = Inst.getDebugLoc();
9470
9471 if (ST.useRealTrue16Insts()) {
9472 Register SrcReg0, SrcReg1;
9473 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9474 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9475 BuildMI(*MBB, Inst, DL,
9476 get(Src0.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg0)
9477 .add(Src0);
9478 } else {
9479 SrcReg0 = Src0.getReg();
9480 }
9481
9482 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9483 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9484 BuildMI(*MBB, Inst, DL,
9485 get(Src1.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg1)
9486 .add(Src1);
9487 } else {
9488 SrcReg1 = Src1.getReg();
9489 }
9490
9491 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9492 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9493
9494 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9495 switch (Inst.getOpcode()) {
9496 case AMDGPU::S_PACK_LL_B32_B16:
9497 NewMI
9498 .addReg(SrcReg0, {},
9499 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9500 .addImm(AMDGPU::lo16)
9501 .addReg(SrcReg1, {},
9502 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9503 .addImm(AMDGPU::hi16);
9504 break;
9505 case AMDGPU::S_PACK_LH_B32_B16:
9506 NewMI
9507 .addReg(SrcReg0, {},
9508 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9509 .addImm(AMDGPU::lo16)
9510 .addReg(SrcReg1, {}, AMDGPU::hi16)
9511 .addImm(AMDGPU::hi16);
9512 break;
9513 case AMDGPU::S_PACK_HL_B32_B16:
9514 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9515 .addImm(AMDGPU::lo16)
9516 .addReg(SrcReg1, {},
9517 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9518 .addImm(AMDGPU::hi16);
9519 break;
9520 case AMDGPU::S_PACK_HH_B32_B16:
9521 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9522 .addImm(AMDGPU::lo16)
9523 .addReg(SrcReg1, {}, AMDGPU::hi16)
9524 .addImm(AMDGPU::hi16);
9525 break;
9526 default:
9527 llvm_unreachable("unhandled s_pack_* instruction");
9528 }
9529
9530 MachineOperand &Dest = Inst.getOperand(0);
9531 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9532 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9533 return;
9534 }
9535
9536 switch (Inst.getOpcode()) {
9537 case AMDGPU::S_PACK_LL_B32_B16: {
9538 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9539 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9540
9541 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9542 // 0.
9543 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9544 .addImm(0xffff);
9545
9546 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9547 .addReg(ImmReg, RegState::Kill)
9548 .add(Src0);
9549
9550 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9551 .add(Src1)
9552 .addImm(16)
9553 .addReg(TmpReg, RegState::Kill);
9554 break;
9555 }
9556 case AMDGPU::S_PACK_LH_B32_B16: {
9557 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9558 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9559 .addImm(0xffff);
9560 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9561 .addReg(ImmReg, RegState::Kill)
9562 .add(Src0)
9563 .add(Src1);
9564 break;
9565 }
9566 case AMDGPU::S_PACK_HL_B32_B16: {
9567 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9568 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9569 .addImm(16)
9570 .add(Src0);
9571 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9572 .add(Src1)
9573 .addImm(16)
9574 .addReg(TmpReg, RegState::Kill);
9575 break;
9576 }
9577 case AMDGPU::S_PACK_HH_B32_B16: {
9578 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9579 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9580 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9581 .addImm(16)
9582 .add(Src0);
9583 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9584 .addImm(0xffff0000);
9585 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9586 .add(Src1)
9587 .addReg(ImmReg, RegState::Kill)
9588 .addReg(TmpReg, RegState::Kill);
9589 break;
9590 }
9591 default:
9592 llvm_unreachable("unhandled s_pack_* instruction");
9593 }
9594
9595 MachineOperand &Dest = Inst.getOperand(0);
9596 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9597 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9598}
9599
9600void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9601 MachineInstr &SCCDefInst,
9602 SIInstrWorklist &Worklist,
9603 Register NewCond) const {
9604
9605 // Ensure that def inst defines SCC, which is still live.
9606 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9607 !Op.isDead() && Op.getParent() == &SCCDefInst);
9608 SmallVector<MachineInstr *, 4> CopyToDelete;
9609 // This assumes that all the users of SCC are in the same block
9610 // as the SCC def.
9611 for (MachineInstr &MI : // Skip the def inst itself.
9612 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9613 SCCDefInst.getParent()->end())) {
9614 // Check if SCC is used first.
9615 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9616 if (SCCIdx != -1) {
9617 if (MI.isCopy()) {
9618 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9619 Register DestReg = MI.getOperand(0).getReg();
9620
9621 MRI.replaceRegWith(DestReg, NewCond);
9622 CopyToDelete.push_back(&MI);
9623 } else {
9624
9625 if (NewCond.isValid())
9626 MI.getOperand(SCCIdx).setReg(NewCond);
9627
9628 Worklist.insert(&MI);
9629 }
9630 }
9631 // Exit if we find another SCC def.
9632 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9633 break;
9634 }
9635 for (auto &Copy : CopyToDelete)
9636 Copy->eraseFromParent();
9637}
9638
9639// Instructions that use SCC may be converted to VALU instructions. When that
9640// happens, the SCC register is changed to VCC_LO. The instruction that defines
9641// SCC must be changed to an instruction that defines VCC. This function makes
9642// sure that the instruction that defines SCC is added to the moveToVALU
9643// worklist.
9644void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9645 SIInstrWorklist &Worklist) const {
9646 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9647 // then there is nothing to do because the defining instruction has been
9648 // converted to a VALU already. If SCC then that instruction needs to be
9649 // converted to a VALU.
9650 for (MachineInstr &MI :
9651 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9652 SCCUseInst->getParent()->rend())) {
9653 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9654 break;
9655 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9656 Worklist.insert(&MI);
9657 break;
9658 }
9659 }
9660}
9661
9662const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9663 const MachineInstr &Inst) const {
9664 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9665
9666 switch (Inst.getOpcode()) {
9667 // For target instructions, getOpRegClass just returns the virtual register
9668 // class associated with the operand, so we need to find an equivalent VGPR
9669 // register class in order to move the instruction to the VALU.
9670 case AMDGPU::COPY:
9671 case AMDGPU::PHI:
9672 case AMDGPU::REG_SEQUENCE:
9673 case AMDGPU::INSERT_SUBREG:
9674 case AMDGPU::WQM:
9675 case AMDGPU::SOFT_WQM:
9676 case AMDGPU::STRICT_WWM:
9677 case AMDGPU::STRICT_WQM: {
9678 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9679 if (RI.isAGPRClass(SrcRC)) {
9680 if (RI.isAGPRClass(NewDstRC))
9681 return nullptr;
9682
9683 switch (Inst.getOpcode()) {
9684 case AMDGPU::PHI:
9685 case AMDGPU::REG_SEQUENCE:
9686 case AMDGPU::INSERT_SUBREG:
9687 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9688 break;
9689 default:
9690 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9691 }
9692
9693 if (!NewDstRC)
9694 return nullptr;
9695 } else {
9696 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9697 return nullptr;
9698
9699 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9700 if (!NewDstRC)
9701 return nullptr;
9702 }
9703
9704 return NewDstRC;
9705 }
9706 default:
9707 return NewDstRC;
9708 }
9709}
9710
9711// Find the one SGPR operand we are allowed to use.
9712Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9713 int OpIndices[3]) const {
9714 const MCInstrDesc &Desc = MI.getDesc();
9715
9716 // Find the one SGPR operand we are allowed to use.
9717 //
9718 // First we need to consider the instruction's operand requirements before
9719 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9720 // of VCC, but we are still bound by the constant bus requirement to only use
9721 // one.
9722 //
9723 // If the operand's class is an SGPR, we can never move it.
9724
9725 Register SGPRReg = findImplicitSGPRRead(MI);
9726 if (SGPRReg)
9727 return SGPRReg;
9728
9729 Register UsedSGPRs[3] = {Register()};
9730 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9731
9732 for (unsigned i = 0; i < 3; ++i) {
9733 int Idx = OpIndices[i];
9734 if (Idx == -1)
9735 break;
9736
9737 const MachineOperand &MO = MI.getOperand(Idx);
9738 if (!MO.isReg())
9739 continue;
9740
9741 // Is this operand statically required to be an SGPR based on the operand
9742 // constraints?
9743 const TargetRegisterClass *OpRC =
9744 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9745 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9746 if (IsRequiredSGPR)
9747 return MO.getReg();
9748
9749 // If this could be a VGPR or an SGPR, Check the dynamic register class.
9750 Register Reg = MO.getReg();
9751 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9752 if (RI.isSGPRClass(RegRC))
9753 UsedSGPRs[i] = Reg;
9754 }
9755
9756 // We don't have a required SGPR operand, so we have a bit more freedom in
9757 // selecting operands to move.
9758
9759 // Try to select the most used SGPR. If an SGPR is equal to one of the
9760 // others, we choose that.
9761 //
9762 // e.g.
9763 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9764 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9765
9766 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9767 // prefer those.
9768
9769 if (UsedSGPRs[0]) {
9770 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9771 SGPRReg = UsedSGPRs[0];
9772 }
9773
9774 if (!SGPRReg && UsedSGPRs[1]) {
9775 if (UsedSGPRs[1] == UsedSGPRs[2])
9776 SGPRReg = UsedSGPRs[1];
9777 }
9778
9779 return SGPRReg;
9780}
9781
9783 AMDGPU::OpName OperandName) const {
9784 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9785 return nullptr;
9786
9787 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9788 if (Idx == -1)
9789 return nullptr;
9790
9791 return &MI.getOperand(Idx);
9792}
9793
9795 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9796 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9799 return (Format << 44) |
9800 (1ULL << 56) | // RESOURCE_LEVEL = 1
9801 (3ULL << 60); // OOB_SELECT = 3
9802 }
9803
9804 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9805 if (ST.isAmdHsaOS()) {
9806 // Set ATC = 1. GFX9 doesn't have this bit.
9807 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9808 RsrcDataFormat |= (1ULL << 56);
9809
9810 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9811 // BTW, it disables TC L2 and therefore decreases performance.
9812 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9813 RsrcDataFormat |= (2ULL << 59);
9814 }
9815
9816 return RsrcDataFormat;
9817}
9818
9822 0xffffffff; // Size;
9823
9824 // GFX9 doesn't have ELEMENT_SIZE.
9825 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9826 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9827 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9828 }
9829
9830 // IndexStride = 64 / 32.
9831 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9832 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9833
9834 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9835 // Clear them unless we want a huge stride.
9836 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9837 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9838 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9839
9840 return Rsrc23;
9841}
9842
9844 unsigned Opc = MI.getOpcode();
9845
9846 return isSMRD(Opc);
9847}
9848
9850 return get(Opc).mayLoad() &&
9851 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9852}
9853
9855 TypeSize &MemBytes) const {
9856 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9857 if (!Addr || !Addr->isFI())
9858 return Register();
9859
9860 assert(!MI.memoperands_empty() &&
9861 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9862
9863 FrameIndex = Addr->getIndex();
9864
9865 int VDataIdx =
9866 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
9867 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), VDataIdx));
9868 return MI.getOperand(VDataIdx).getReg();
9869}
9870
9872 TypeSize &MemBytes) const {
9873 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9874 assert(Addr && Addr->isFI());
9875 FrameIndex = Addr->getIndex();
9876
9877 int DataIdx =
9878 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::data);
9879 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), DataIdx));
9880 return MI.getOperand(DataIdx).getReg();
9881}
9882
9884 int &FrameIndex,
9885 TypeSize &MemBytes) const {
9886 if (!MI.mayLoad())
9887 return Register();
9888
9889 if (isMUBUF(MI) || isVGPRSpill(MI))
9890 return isStackAccess(MI, FrameIndex, MemBytes);
9891
9892 if (isSGPRSpill(MI))
9893 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9894
9895 return Register();
9896}
9897
9899 int &FrameIndex,
9900 TypeSize &MemBytes) const {
9901 if (!MI.mayStore())
9902 return Register();
9903
9904 if (isMUBUF(MI) || isVGPRSpill(MI))
9905 return isStackAccess(MI, FrameIndex, MemBytes);
9906
9907 if (isSGPRSpill(MI))
9908 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9909
9910 return Register();
9911}
9912
9914 unsigned Opc = MI.getOpcode();
9916 unsigned DescSize = Desc.getSize();
9917
9918 // If we have a definitive size, we can use it. Otherwise we need to inspect
9919 // the operands to know the size.
9920 if (isFixedSize(MI)) {
9921 unsigned Size = DescSize;
9922
9923 // If we hit the buggy offset, an extra nop will be inserted in MC so
9924 // estimate the worst case.
9925 if (MI.isBranch() && ST.hasOffset3fBug())
9926 Size += 4;
9927
9928 return Size;
9929 }
9930
9931 // Instructions may have a 32-bit literal encoded after them. Check
9932 // operands that could ever be literals.
9933 if (isVALU(MI) || isSALU(MI)) {
9934 if (isDPP(MI))
9935 return DescSize;
9936 bool HasLiteral = false;
9937 unsigned LiteralSize = 4;
9938 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9939 const MachineOperand &Op = MI.getOperand(I);
9940 const MCOperandInfo &OpInfo = Desc.operands()[I];
9941 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9942 HasLiteral = true;
9943 if (ST.has64BitLiterals()) {
9944 switch (OpInfo.OperandType) {
9945 default:
9946 break;
9948 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9949 LiteralSize = 8;
9950 break;
9952 // A 32-bit literal is only valid when the value fits in BOTH signed
9953 // and unsigned 32-bit ranges [0, 2^31-1], matching the MC code
9954 // emitter's getLit64Encoding logic. This is because of the lack of
9955 // ability to tell signedness of the literal, therefore we need to
9956 // be conservative and assume values outside this range require a
9957 // 64-bit literal encoding (8 bytes).
9958 if (!Op.isImm() || !isInt<32>(Op.getImm()) ||
9959 !isUInt<32>(Op.getImm()))
9960 LiteralSize = 8;
9961 break;
9962 }
9963 }
9964 break;
9965 }
9966 }
9967 return HasLiteral ? DescSize + LiteralSize : DescSize;
9968 }
9969
9970 // Check whether we have extra NSA words.
9971 if (isMIMG(MI)) {
9972 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9973 if (VAddr0Idx < 0)
9974 return 8;
9975
9976 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9977 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9978 }
9979
9980 switch (Opc) {
9981 case TargetOpcode::BUNDLE:
9982 return getInstBundleSize(MI);
9983 case TargetOpcode::INLINEASM:
9984 case TargetOpcode::INLINEASM_BR: {
9985 const MachineFunction *MF = MI.getMF();
9986 const char *AsmStr = MI.getOperand(0).getSymbolName();
9987 return getInlineAsmLength(AsmStr, MF->getTarget().getMCAsmInfo(), &ST);
9988 }
9989 default:
9990 if (MI.isMetaInstruction())
9991 return 0;
9992
9993 // If D16 Pseudo inst, get correct MC code size
9994 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9995 if (D16Info) {
9996 // Assume d16_lo/hi inst are always in same size
9997 unsigned LoInstOpcode = D16Info->LoOp;
9998 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9999 DescSize = Desc.getSize();
10000 }
10001
10002 // If FMA Pseudo inst, get correct MC code size
10003 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
10004 // All potential lowerings are the same size; arbitrarily pick one.
10005 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
10006 DescSize = Desc.getSize();
10007 }
10008
10009 return DescSize;
10010 }
10011}
10012
10015 if (MI.isBranch() && ST.hasOffset3fBug())
10016 return InstSizeVerifyMode::NoVerify;
10017 return InstSizeVerifyMode::ExactSize;
10018}
10019
10021 if (!isFLAT(MI))
10022 return false;
10023
10024 if (MI.memoperands_empty())
10025 return true;
10026
10027 for (const MachineMemOperand *MMO : MI.memoperands()) {
10028 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
10029 return true;
10030 }
10031 return false;
10032}
10033
10036 static const std::pair<int, const char *> TargetIndices[] = {
10037 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
10038 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
10039 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
10040 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
10041 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
10042 return ArrayRef(TargetIndices);
10043}
10044
10045/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
10046/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
10049 const ScheduleDAG *DAG) const {
10050 return new GCNHazardRecognizer(DAG->MF);
10051}
10052
10053/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
10054/// pass.
10057 MachineLoopInfo *MLI) const {
10058 return new GCNHazardRecognizer(MF, MLI);
10059}
10060
10061// Called during:
10062// - pre-RA scheduling and post-RA scheduling
10065 const ScheduleDAGMI *DAG) const {
10066 // Borrowed from Arm Target
10067 // We would like to restrict this hazard recognizer to only
10068 // post-RA scheduling; we can tell that we're post-RA because we don't
10069 // track VRegLiveness.
10070 if (!DAG->hasVRegLiveness())
10071 return new GCNHazardRecognizer(DAG->MF);
10073}
10074
10075std::pair<unsigned, unsigned>
10077 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
10078}
10079
10082 static const std::pair<unsigned, const char *> TargetFlags[] = {
10083 {MO_GOTPCREL, "amdgpu-gotprel"},
10084 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
10085 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
10086 {MO_GOTPCREL64, "amdgpu-gotprel64"},
10087 {MO_REL32_LO, "amdgpu-rel32-lo"},
10088 {MO_REL32_HI, "amdgpu-rel32-hi"},
10089 {MO_REL64, "amdgpu-rel64"},
10090 {MO_ABS32_LO, "amdgpu-abs32-lo"},
10091 {MO_ABS32_HI, "amdgpu-abs32-hi"},
10092 {MO_ABS64, "amdgpu-abs64"},
10093 };
10094
10095 return ArrayRef(TargetFlags);
10096}
10097
10100 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
10101 {
10102 {MONoClobber, "amdgpu-noclobber"},
10103 {MOLastUse, "amdgpu-last-use"},
10104 {MOCooperative, "amdgpu-cooperative"},
10105 {MOThreadPrivate, "amdgpu-thread-private"},
10106 };
10107
10108 return ArrayRef(TargetFlags);
10109}
10110
10112 const MachineFunction &MF) const {
10114 assert(SrcReg.isVirtual());
10115 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
10116 return AMDGPU::WWM_COPY;
10117
10118 return AMDGPU::COPY;
10119}
10120
10122 uint32_t Opcode = MI.getOpcode();
10123 // Check if it is SGPR spill or wwm-register spill Opcode.
10124 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
10125 return true;
10126
10127 const MachineFunction *MF = MI.getMF();
10128 const MachineRegisterInfo &MRI = MF->getRegInfo();
10130
10131 // See if this is Liverange split instruction inserted for SGPR or
10132 // wwm-register. The implicit def inserted for wwm-registers should also be
10133 // included as they can appear at the bb begin.
10134 bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
10135 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
10136 return false;
10137
10138 Register Reg = MI.getOperand(0).getReg();
10139 if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
10140 return IsLRSplitInst;
10141
10142 return MFI->isWWMReg(Reg);
10143}
10144
10146 Register Reg) const {
10147 // We need to handle instructions which may be inserted during register
10148 // allocation to handle the prolog. The initial prolog instruction may have
10149 // been separated from the start of the block by spills and copies inserted
10150 // needed by the prolog. However, the insertions for scalar registers can
10151 // always be placed at the BB top as they are independent of the exec mask
10152 // value.
10153 bool IsNullOrVectorRegister = true;
10154 if (Reg) {
10155 const MachineFunction *MF = MI.getMF();
10156 const MachineRegisterInfo &MRI = MF->getRegInfo();
10157 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
10158 }
10159
10160 return IsNullOrVectorRegister &&
10161 (canAddToBBProlog(MI) ||
10162 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
10163 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
10164}
10165
10169 const DebugLoc &DL,
10170 Register DestReg) const {
10171 if (ST.hasAddNoCarryInsts())
10172 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
10173
10174 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10175 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
10176 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
10177
10178 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10179 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10180}
10181
10184 const DebugLoc &DL,
10185 Register DestReg,
10186 RegScavenger &RS) const {
10187 if (ST.hasAddNoCarryInsts())
10188 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
10189
10190 // If available, prefer to use vcc.
10191 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
10192 ? Register(RI.getVCC())
10193 : RS.scavengeRegisterBackwards(
10194 *RI.getBoolRC(), I, /* RestoreAfter */ false,
10195 0, /* AllowSpill */ false);
10196
10197 // TODO: Users need to deal with this.
10198 if (!UnusedCarry.isValid())
10199 return MachineInstrBuilder();
10200
10201 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10202 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10203}
10204
10205bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10206 switch (Opcode) {
10207 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10208 case AMDGPU::SI_KILL_I1_TERMINATOR:
10209 return true;
10210 default:
10211 return false;
10212 }
10213}
10214
10216 switch (Opcode) {
10217 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10218 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10219 case AMDGPU::SI_KILL_I1_PSEUDO:
10220 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
10221 default:
10222 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10223 }
10224}
10225
10226bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10227 return Imm <= getMaxMUBUFImmOffset(ST);
10228}
10229
10231 // GFX12 field is non-negative 24-bit signed byte offset.
10232 const unsigned OffsetBits =
10233 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10234 return (1 << OffsetBits) - 1;
10235}
10236
10238 if (!ST.isWave32())
10239 return;
10240
10241 if (MI.isInlineAsm())
10242 return;
10243
10244 if (MI.getNumOperands() < MI.getNumExplicitOperands())
10245 return;
10246
10247 for (auto &Op : MI.implicit_operands()) {
10248 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10249 Op.setReg(AMDGPU::VCC_LO);
10250 }
10251}
10252
10254 if (!isSMRD(MI))
10255 return false;
10256
10257 // Check that it is using a buffer resource.
10258 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
10259 if (Idx == -1) // e.g. s_memtime
10260 return false;
10261
10262 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
10263 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
10264}
10265
10266// Given Imm, split it into the values to put into the SOffset and ImmOffset
10267// fields in an MUBUF instruction. Return false if it is not possible (due to a
10268// hardware bug needing a workaround).
10269//
10270// The required alignment ensures that individual address components remain
10271// aligned if they are aligned to begin with. It also ensures that additional
10272// offsets within the given alignment can be added to the resulting ImmOffset.
10274 uint32_t &ImmOffset, Align Alignment) const {
10275 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10276 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
10277 uint32_t Overflow = 0;
10278
10279 if (Imm > MaxImm) {
10280 if (Imm <= MaxImm + 64) {
10281 // Use an SOffset inline constant for 4..64
10282 Overflow = Imm - MaxImm;
10283 Imm = MaxImm;
10284 } else {
10285 // Try to keep the same value in SOffset for adjacent loads, so that
10286 // the corresponding register contents can be re-used.
10287 //
10288 // Load values with all low-bits (except for alignment bits) set into
10289 // SOffset, so that a larger range of values can be covered using
10290 // s_movk_i32.
10291 //
10292 // Atomic operations fail to work correctly when individual address
10293 // components are unaligned, even if their sum is aligned.
10294 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10295 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10296 Imm = Low;
10297 Overflow = High - Alignment.value();
10298 }
10299 }
10300
10301 if (Overflow > 0) {
10302 // There is a hardware bug in SI and CI which prevents address clamping in
10303 // MUBUF instructions from working correctly with SOffsets. The immediate
10304 // offset is unaffected.
10305 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10306 return false;
10307
10308 // It is not possible to set immediate in SOffset field on some targets.
10309 if (ST.hasRestrictedSOffset())
10310 return false;
10311 }
10312
10313 ImmOffset = Imm;
10314 SOffset = Overflow;
10315 return true;
10316}
10317
10318// Depending on the used address space and instructions, some immediate offsets
10319// are allowed and some are not.
10320// Pre-GFX12, flat instruction offsets can only be non-negative, global and
10321// scratch instruction offsets can also be negative. On GFX12, offsets can be
10322// negative for all variants.
10323//
10324// There are several bugs related to these offsets:
10325// On gfx10.1, flat instructions that go into the global address space cannot
10326// use an offset.
10327//
10328// For scratch instructions, the address can be either an SGPR or a VGPR.
10329// The following offsets can be used, depending on the architecture (x means
10330// cannot be used):
10331// +----------------------------+------+------+
10332// | Address-Mode | SGPR | VGPR |
10333// +----------------------------+------+------+
10334// | gfx9 | | |
10335// | negative, 4-aligned offset | x | ok |
10336// | negative, unaligned offset | x | ok |
10337// +----------------------------+------+------+
10338// | gfx10 | | |
10339// | negative, 4-aligned offset | ok | ok |
10340// | negative, unaligned offset | ok | x |
10341// +----------------------------+------+------+
10342// | gfx10.3 | | |
10343// | negative, 4-aligned offset | ok | ok |
10344// | negative, unaligned offset | ok | ok |
10345// +----------------------------+------+------+
10346//
10347// This function ignores the addressing mode, so if an offset cannot be used in
10348// one addressing mode, it is considered illegal.
10349bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10350 uint64_t FlatVariant) const {
10351 // TODO: Should 0 be special cased?
10352 if (!ST.hasFlatInstOffsets())
10353 return false;
10354
10355 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10356 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10357 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10358 return false;
10359
10360 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10361 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10362 (Offset % 4) != 0) {
10363 return false;
10364 }
10365
10366 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10367 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10368 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10369}
10370
10371// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
10372std::pair<int64_t, int64_t>
10373SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10374 uint64_t FlatVariant) const {
10375 int64_t RemainderOffset = COffsetVal;
10376 int64_t ImmField = 0;
10377
10378 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10379 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10380
10381 if (AllowNegative) {
10382 // Use signed division by a power of two to truncate towards 0.
10383 int64_t D = 1LL << NumBits;
10384 RemainderOffset = (COffsetVal / D) * D;
10385 ImmField = COffsetVal - RemainderOffset;
10386
10387 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10388 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10389 (ImmField % 4) != 0) {
10390 // Make ImmField a multiple of 4
10391 RemainderOffset += ImmField % 4;
10392 ImmField -= ImmField % 4;
10393 }
10394 } else if (COffsetVal >= 0) {
10395 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10396 RemainderOffset = COffsetVal - ImmField;
10397 }
10398
10399 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10400 assert(RemainderOffset + ImmField == COffsetVal);
10401 return {ImmField, RemainderOffset};
10402}
10403
10405 if (ST.hasNegativeScratchOffsetBug() &&
10406 FlatVariant == SIInstrFlags::FlatScratch)
10407 return false;
10408
10409 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
10410}
10411
10412static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10413 switch (ST.getGeneration()) {
10414 default:
10415 break;
10418 return SIEncodingFamily::SI;
10421 return SIEncodingFamily::VI;
10425 return ST.hasGFX11_7Insts() ? SIEncodingFamily::GFX1170
10428 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10432 }
10433 llvm_unreachable("Unknown subtarget generation!");
10434}
10435
10436bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10437 switch(MCOp) {
10438 // These opcodes use indirect register addressing so
10439 // they need special handling by codegen (currently missing).
10440 // Therefore it is too risky to allow these opcodes
10441 // to be selected by dpp combiner or sdwa peepholer.
10442 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10443 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10444 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10445 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10446 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10447 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10448 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10449 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10450 return true;
10451 default:
10452 return false;
10453 }
10454}
10455
// Expands to the five case labels formed by appending the _e32, _e64,
// _e64_dpp, _dpp and _sdwa suffixes to OPCODE. The expansion ends with a case
// label, so it must be followed by a statement or further labels.
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)                                    \
  case OPCODE##_e32:                                                           \
  case OPCODE##_e64:                                                           \
  case OPCODE##_e64_dpp:                                                       \
  case OPCODE##_dpp:                                                           \
  case OPCODE##_sdwa:
10462
10463static bool isRenamedInGFX9(int Opcode) {
10464 switch (Opcode) {
10465 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10466 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10467 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10468 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10469 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10470 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10471 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10472 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10473 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10474 //
10475 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10476 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10477 case AMDGPU::V_FMA_F16_gfx9_e64:
10478 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10479 case AMDGPU::V_INTERP_P2_F16:
10480 case AMDGPU::V_MAD_F16_e64:
10481 case AMDGPU::V_MAD_U16_e64:
10482 case AMDGPU::V_MAD_I16_e64:
10483 return true;
10484 default:
10485 return false;
10486 }
10487}
10488
10489int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10490 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10491 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10492
10493 unsigned Gen = subtargetEncodingFamily(ST);
10494
10495 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10497
10498 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10499 // subtarget has UnpackedD16VMem feature.
10500 // TODO: remove this when we discard GFX80 encoding.
10501 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10503
10504 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10505 switch (ST.getGeneration()) {
10506 default:
10508 break;
10511 break;
10514 break;
10515 }
10516 }
10517
10518 if (isMAI(Opcode)) {
10519 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10520 if (MFMAOp != -1)
10521 Opcode = MFMAOp;
10522 }
10523
10524 int32_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10525
10526 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX11_7Insts())
10528
10529 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX1250Insts())
10531
10532 // -1 means that Opcode is already a native instruction.
10533 if (MCOp == -1)
10534 return Opcode;
10535
10536 if (ST.hasGFX90AInsts()) {
10537 uint32_t NMCOp = AMDGPU::INSTRUCTION_LIST_END;
10538 if (ST.hasGFX940Insts())
10540 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10542 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10544 if (NMCOp != AMDGPU::INSTRUCTION_LIST_END)
10545 MCOp = NMCOp;
10546 }
10547
10548 // INSTRUCTION_LIST_END means that Opcode is a pseudo instruction that has no
10549 // encoding in the given subtarget generation.
10550 if (MCOp == AMDGPU::INSTRUCTION_LIST_END)
10551 return -1;
10552
10553 if (isAsmOnlyOpcode(MCOp))
10554 return -1;
10555
10556 return MCOp;
10557}
10558
10559static
10561 assert(RegOpnd.isReg());
10562 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10563 getRegSubRegPair(RegOpnd);
10564}
10565
10568 assert(MI.isRegSequence());
10569 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10570 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10571 auto &RegOp = MI.getOperand(1 + 2 * I);
10572 return getRegOrUndef(RegOp);
10573 }
10575}
10576
10577// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10578// Following a subreg of reg:subreg isn't supported
10581 if (!RSR.SubReg)
10582 return false;
10583 switch (MI.getOpcode()) {
10584 default: break;
10585 case AMDGPU::REG_SEQUENCE:
10586 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10587 return true;
10588 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
10589 case AMDGPU::INSERT_SUBREG:
10590 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10591 // inserted the subreg we're looking for
10592 RSR = getRegOrUndef(MI.getOperand(2));
10593 else { // the subreg in the rest of the reg
10594 auto R1 = getRegOrUndef(MI.getOperand(1));
10595 if (R1.SubReg) // subreg of subreg isn't supported
10596 return false;
10597 RSR.Reg = R1.Reg;
10598 }
10599 return true;
10600 }
10601 return false;
10602}
10603
10605 const MachineRegisterInfo &MRI) {
10606 assert(MRI.isSSA());
10607 if (!P.Reg.isVirtual())
10608 return nullptr;
10609
10610 auto RSR = P;
10611 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10612 while (auto *MI = DefInst) {
10613 DefInst = nullptr;
10614 switch (MI->getOpcode()) {
10615 case AMDGPU::COPY:
10616 case AMDGPU::V_MOV_B32_e32: {
10617 auto &Op1 = MI->getOperand(1);
10618 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10619 if (Op1.isUndef())
10620 return nullptr;
10621 RSR = getRegSubRegPair(Op1);
10622 DefInst = MRI.getVRegDef(RSR.Reg);
10623 }
10624 break;
10625 }
10626 default:
10627 if (followSubRegDef(*MI, RSR)) {
10628 if (!RSR.Reg)
10629 return nullptr;
10630 DefInst = MRI.getVRegDef(RSR.Reg);
10631 }
10632 }
10633 if (!DefInst)
10634 return MI;
10635 }
10636 return nullptr;
10637}
10638
10640 Register VReg,
10641 const MachineInstr &DefMI,
10642 const MachineInstr &UseMI) {
10643 assert(MRI.isSSA() && "Must be run on SSA");
10644
10645 auto *TRI = MRI.getTargetRegisterInfo();
10646 auto *DefBB = DefMI.getParent();
10647
10648 // Don't bother searching between blocks, although it is possible this block
10649 // doesn't modify exec.
10650 if (UseMI.getParent() != DefBB)
10651 return true;
10652
10653 const int MaxInstScan = 20;
10654 int NumInst = 0;
10655
10656 // Stop scan at the use.
10657 auto E = UseMI.getIterator();
10658 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10659 if (I->isDebugInstr())
10660 continue;
10661
10662 if (++NumInst > MaxInstScan)
10663 return true;
10664
10665 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10666 return true;
10667 }
10668
10669 return false;
10670}
10671
10673 Register VReg,
10674 const MachineInstr &DefMI) {
10675 assert(MRI.isSSA() && "Must be run on SSA");
10676
10677 auto *TRI = MRI.getTargetRegisterInfo();
10678 auto *DefBB = DefMI.getParent();
10679
10680 const int MaxUseScan = 10;
10681 int NumUse = 0;
10682
10683 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10684 auto &UseInst = *Use.getParent();
10685 // Don't bother searching between blocks, although it is possible this block
10686 // doesn't modify exec.
10687 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10688 return true;
10689
10690 if (++NumUse > MaxUseScan)
10691 return true;
10692 }
10693
10694 if (NumUse == 0)
10695 return false;
10696
10697 const int MaxInstScan = 20;
10698 int NumInst = 0;
10699
10700 // Stop scan when we have seen all the uses.
10701 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10702 assert(I != DefBB->end());
10703
10704 if (I->isDebugInstr())
10705 continue;
10706
10707 if (++NumInst > MaxInstScan)
10708 return true;
10709
10710 for (const MachineOperand &Op : I->operands()) {
10711 // We don't check reg masks here as they're used only on calls:
10712 // 1. EXEC is only considered const within one BB
10713 // 2. Call should be a terminator instruction if present in a BB
10714
10715 if (!Op.isReg())
10716 continue;
10717
10718 Register Reg = Op.getReg();
10719 if (Op.isUse()) {
10720 if (Reg == VReg && --NumUse == 0)
10721 return false;
10722 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10723 return true;
10724 }
10725 }
10726}
10727
10730 const DebugLoc &DL, Register Src, Register Dst) const {
10731 auto Cur = MBB.begin();
10732 if (Cur != MBB.end())
10733 do {
10734 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10735 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10736 ++Cur;
10737 } while (Cur != MBB.end() && Cur != LastPHIIt);
10738
10739 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10740 Dst);
10741}
10742
10745 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10746 if (InsPt != MBB.end() &&
10747 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10748 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10749 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10750 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10751 InsPt++;
10752 return BuildMI(MBB, InsPt, DL,
10753 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10754 .addReg(Src, {}, SrcSubReg)
10755 .addReg(AMDGPU::EXEC, RegState::Implicit);
10756 }
10757 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10758 Dst);
10759}
10760
10761bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10762
10764 const MachineInstr &SecondMI) const {
10765 for (const auto &Use : SecondMI.all_uses()) {
10766 if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg(), &RI))
10767 return true;
10768 }
10769 return false;
10770}
10771
10772/// If OpX is multicycle, anti-dependencies are not allowed.
10773/// isDPMACCInstruction was not designed for VOPD, but it is fit for the
10774/// purpose.
10776 const MachineInstr &OpX) const {
10778}
10779
10782 ArrayRef<unsigned> Ops, int FrameIndex,
10783 MachineInstr *&CopyMI, LiveIntervals *LIS,
10784 VirtRegMap *VRM) const {
10785 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10786 //
10787 // %0:sreg_32 = COPY $m0
10788 //
10789 // We explicitly chose SReg_32 for the virtual register so such a copy might
10790 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10791 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10792 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10793 // TargetInstrInfo::foldMemoryOperand() is going to try.
10794 // A similar issue also exists with spilling and reloading $exec registers.
10795 //
10796 // To prevent that, constrain the %0 register class here.
10797 if (isFullCopyInstr(MI)) {
10798 Register DstReg = MI.getOperand(0).getReg();
10799 Register SrcReg = MI.getOperand(1).getReg();
10800 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10801 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10802 MachineRegisterInfo &MRI = MF.getRegInfo();
10803 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10804 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10805 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10806 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10807 return nullptr;
10808 }
10809 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10810 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10811 return nullptr;
10812 }
10813 }
10814 }
10815
10816 return nullptr;
10817}
10818
10820 const MachineInstr &MI,
10821 unsigned *PredCost) const {
10822 if (MI.isBundle()) {
10824 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10825 unsigned Lat = 0, Count = 0;
10826 for (++I; I != E && I->isBundledWithPred(); ++I) {
10827 ++Count;
10828 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10829 }
10830 return Lat + Count - 1;
10831 }
10832
10833 return SchedModel.computeInstrLatency(&MI);
10834}
10835
10836const MachineOperand &
10838 if (const MachineOperand *CallAddrOp =
10839 getNamedOperand(MI, AMDGPU::OpName::src0))
10840 return *CallAddrOp;
10842}
10843
10846 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10847 unsigned Opcode = MI.getOpcode();
10848
10849 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10850 Register Dst = MI.getOperand(0).getReg();
10851 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10852 : MI.getOperand(1).getReg();
10853 LLT DstTy = MRI.getType(Dst);
10854 LLT SrcTy = MRI.getType(Src);
10855 unsigned DstAS = DstTy.getAddressSpace();
10856 unsigned SrcAS = SrcTy.getAddressSpace();
10857 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10858 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10859 ST.hasGloballyAddressableScratch()
10862 };
10863
10864 // If the target supports globally addressable scratch, the mapping from
10865 // scratch memory to the flat aperture changes therefore an address space cast
10866 // is no longer uniform.
10867 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10868 return HandleAddrSpaceCast(MI);
10869
10870 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10871 auto IID = GI->getIntrinsicID();
10876
10877 switch (IID) {
10878 case Intrinsic::amdgcn_addrspacecast_nonnull:
10879 return HandleAddrSpaceCast(MI);
10880 case Intrinsic::amdgcn_if:
10881 case Intrinsic::amdgcn_else:
10882 // FIXME: Uniform if second result
10883 break;
10884 }
10885
10887 }
10888
10889 // Loads from the private and flat address spaces are divergent, because
10890 // threads can execute the load instruction with the same inputs and get
10891 // different results.
10892 //
10893 // All other loads are not divergent, because if threads issue loads with the
10894 // same arguments, they will always get the same result.
10895 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10896 Opcode == AMDGPU::G_SEXTLOAD) {
10897 if (MI.memoperands_empty())
10898 return ValueUniformity::NeverUniform; // conservative assumption
10899
10900 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10901 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10902 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10903 })) {
10904 // At least one MMO in a non-global address space.
10906 }
10908 }
10909
10910 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10911 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10912 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10913 AMDGPU::isGenericAtomic(Opcode)) {
10915 }
10917}
10918
10920 if (!Formatter)
10921 Formatter = std::make_unique<AMDGPUMIRFormatter>(ST);
10922 return Formatter.get();
10923}
10924
10926
10927 if (isNeverUniform(MI))
10929
10930 unsigned opcode = MI.getOpcode();
10931 if (opcode == AMDGPU::V_READLANE_B32 ||
10932 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10933 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10935
10936 if (isCopyInstr(MI)) {
10937 const MachineOperand &srcOp = MI.getOperand(1);
10938 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10939 const TargetRegisterClass *regClass =
10940 RI.getPhysRegBaseClass(srcOp.getReg());
10941 return RI.isSGPRClass(regClass) ? ValueUniformity::AlwaysUniform
10943 }
10945 }
10946
10947 // GMIR handling
10948 if (MI.isPreISelOpcode())
10950
10951 // Atomics are divergent because they are executed sequentially: when an
10952 // atomic operation refers to the same address in each thread, then each
10953 // thread after the first sees the value written by the previous thread as
10954 // original value.
10955
10956 if (isAtomic(MI))
10958
10959 // Loads from the private and flat address spaces are divergent, because
10960 // threads can execute the load instruction with the same inputs and get
10961 // different results.
10962 if (isFLAT(MI) && MI.mayLoad()) {
10963 if (MI.memoperands_empty())
10964 return ValueUniformity::NeverUniform; // conservative assumption
10965
10966 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10967 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10968 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10969 })) {
10970 // At least one MMO in a non-global address space.
10972 }
10973
10975 }
10976
10977 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10978 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10979
10980 // FIXME: It's conceptually broken to report this for an instruction, and not
10981 // a specific def operand. For inline asm in particular, there could be mixed
10982 // uniform and divergent results.
10983 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10984 const MachineOperand &SrcOp = MI.getOperand(I);
10985 if (!SrcOp.isReg())
10986 continue;
10987
10988 Register Reg = SrcOp.getReg();
10989 if (!Reg || !SrcOp.readsReg())
10990 continue;
10991
10992 // If RegBank is null, this is unassigned or an unallocatable special
10993 // register, which are all scalars.
10994 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10995 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10997 }
10998
10999 // TODO: Uniformity check conditions above can be rearranged for better
11000 // readability.
11001
11002 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
11003 // currently turned into no-op COPYs by SelectionDAG ISel and are
11004 // therefore no longer recognizable.
11005
11007}
11008
11010 switch (MF.getFunction().getCallingConv()) {
11012 return 1;
11014 return 2;
11016 return 3;
11020 const Function &F = MF.getFunction();
11021 F.getContext().diagnose(DiagnosticInfoUnsupported(
11022 F, "ds_ordered_count unsupported for this calling conv"));
11023 [[fallthrough]];
11024 }
11027 case CallingConv::C:
11028 case CallingConv::Fast:
11029 default:
11030 // Assume other calling conventions are various compute callable functions
11031 return 0;
11032 }
11033}
11034
11036 Register &SrcReg2, int64_t &CmpMask,
11037 int64_t &CmpValue) const {
11038 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
11039 return false;
11040
11041 switch (MI.getOpcode()) {
11042 default:
11043 break;
11044 case AMDGPU::S_CMP_EQ_U32:
11045 case AMDGPU::S_CMP_EQ_I32:
11046 case AMDGPU::S_CMP_LG_U32:
11047 case AMDGPU::S_CMP_LG_I32:
11048 case AMDGPU::S_CMP_LT_U32:
11049 case AMDGPU::S_CMP_LT_I32:
11050 case AMDGPU::S_CMP_GT_U32:
11051 case AMDGPU::S_CMP_GT_I32:
11052 case AMDGPU::S_CMP_LE_U32:
11053 case AMDGPU::S_CMP_LE_I32:
11054 case AMDGPU::S_CMP_GE_U32:
11055 case AMDGPU::S_CMP_GE_I32:
11056 case AMDGPU::S_CMP_EQ_U64:
11057 case AMDGPU::S_CMP_LG_U64:
11058 SrcReg = MI.getOperand(0).getReg();
11059 if (MI.getOperand(1).isReg()) {
11060 if (MI.getOperand(1).getSubReg())
11061 return false;
11062 SrcReg2 = MI.getOperand(1).getReg();
11063 CmpValue = 0;
11064 } else if (MI.getOperand(1).isImm()) {
11065 SrcReg2 = Register();
11066 CmpValue = MI.getOperand(1).getImm();
11067 } else {
11068 return false;
11069 }
11070 CmpMask = ~0;
11071 return true;
11072 case AMDGPU::S_CMPK_EQ_U32:
11073 case AMDGPU::S_CMPK_EQ_I32:
11074 case AMDGPU::S_CMPK_LG_U32:
11075 case AMDGPU::S_CMPK_LG_I32:
11076 case AMDGPU::S_CMPK_LT_U32:
11077 case AMDGPU::S_CMPK_LT_I32:
11078 case AMDGPU::S_CMPK_GT_U32:
11079 case AMDGPU::S_CMPK_GT_I32:
11080 case AMDGPU::S_CMPK_LE_U32:
11081 case AMDGPU::S_CMPK_LE_I32:
11082 case AMDGPU::S_CMPK_GE_U32:
11083 case AMDGPU::S_CMPK_GE_I32:
11084 SrcReg = MI.getOperand(0).getReg();
11085 SrcReg2 = Register();
11086 CmpValue = MI.getOperand(1).getImm();
11087 CmpMask = ~0;
11088 return true;
11089 }
11090
11091 return false;
11092}
11093
11095 for (MachineBasicBlock *S : MBB->successors()) {
11096 if (S->isLiveIn(AMDGPU::SCC))
11097 return false;
11098 }
11099 return true;
11100}
11101
11102// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
11103// (incoming SCC) = !(SCC defined by SCCDef).
11104// Return true if all uses can be re-written, false otherwise.
11105bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
11106 MachineBasicBlock *MBB = SCCDef->getParent();
11107 SmallVector<MachineInstr *> InvertInstr;
11108 bool SCCIsDead = false;
11109
11110 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
11111 constexpr unsigned ScanLimit = 12;
11112 unsigned Count = 0;
11113 for (MachineInstr &MI :
11114 make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
11115 if (++Count > ScanLimit)
11116 return false;
11117 if (MI.readsRegister(AMDGPU::SCC, &RI)) {
11118 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
11119 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
11120 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11121 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
11122 InvertInstr.push_back(&MI);
11123 else
11124 return false;
11125 }
11126 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
11127 SCCIsDead = true;
11128 break;
11129 }
11130 }
11131 if (!SCCIsDead && isSCCDeadOnExit(MBB))
11132 SCCIsDead = true;
11133
11134 // SCC may have more uses. Can't invert all of them.
11135 if (!SCCIsDead)
11136 return false;
11137
11138 // Invert uses
11139 for (MachineInstr *MI : InvertInstr) {
11140 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
11141 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
11142 swapOperands(*MI);
11143 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11144 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
11145 MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
11146 ? AMDGPU::S_CBRANCH_SCC1
11147 : AMDGPU::S_CBRANCH_SCC0));
11148 } else {
11149 llvm_unreachable("SCC used but no inversion handling");
11150 }
11151 }
11152 return true;
11153}
11154
11155// SCC is already valid after SCCValid.
11156// SCCRedefine will redefine SCC to the same value already available after
11157// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
11158// update kill/dead flags if necessary.
11159bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
11160 bool NeedInversion) const {
11161 MachineInstr *KillsSCC = nullptr;
11162 if (SCCValid->getParent() != SCCRedefine->getParent())
11163 return false;
11164 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
11165 SCCRedefine->getIterator())) {
11166 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
11167 return false;
11168 if (MI.killsRegister(AMDGPU::SCC, &RI))
11169 KillsSCC = &MI;
11170 }
11171 if (NeedInversion && !invertSCCUse(SCCRedefine))
11172 return false;
11173 if (MachineOperand *SccDef =
11174 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
11175 SccDef->setIsDead(false);
11176 if (KillsSCC)
11177 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
11178 SCCRedefine->eraseFromParent();
11179 return true;
11180}
11181
11182static bool foldableSelect(const MachineInstr &Def) {
11183 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
11184 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
11185 return false;
11186 bool Op1IsNonZeroImm =
11187 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
11188 bool Op2IsZeroImm =
11189 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
11190 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
11191 return false;
11192 return true;
11193}
11194
11195static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
11196 unsigned &NewDefOpc) {
11197 // S_ADD_U32 X, 1 sets SCC on carryout which can only happen if result==0.
11198 // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
11199 if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
11200 Def.getOpcode() != AMDGPU::S_ADD_U32)
11201 return false;
11202 const MachineOperand &AddSrc1 = Def.getOperand(1);
11203 const MachineOperand &AddSrc2 = Def.getOperand(2);
11204 int64_t addend;
11205
11206 if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
11207 (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
11208 (!getFoldableImm(&AddSrc1, addend) || addend != 1) &&
11209 (!getFoldableImm(&AddSrc2, addend) || addend != 1))
11210 return false;
11211
11212 if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
11213 const MachineOperand *SccDef =
11214 Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
11215 if (!SccDef->isDead())
11216 return false;
11217 NewDefOpc = AMDGPU::S_ADD_U32;
11218 }
11219 NeedInversion = !NeedInversion;
11220 return true;
11221}
11222
11224 Register SrcReg2, int64_t CmpMask,
11225 int64_t CmpValue,
11226 const MachineRegisterInfo *MRI) const {
11227 if (!SrcReg || SrcReg.isPhysical())
11228 return false;
11229
11230 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
11231 return false;
11232
11233 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
11234 this](bool NeedInversion) -> bool {
11235 if (CmpValue != 0)
11236 return false;
11237
11238 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11239 if (!Def)
11240 return false;
11241
11242 // For S_OP that set SCC = DST!=0, do the transformation
11243 //
11244 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11245 //
11246 // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and
11247 // do the transformation:
11248 //
11249 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11250 //
11251 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
11252 // for S_CSELECT* already has the same value that will be calculated by
11253 // s_cmp_lg_*
11254 //
11255 // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
11256 // (non-zero imm), 0)
11257
11258 unsigned NewDefOpc = Def->getOpcode();
11259 if (!setsSCCIfResultIsNonZero(*Def) &&
11260 !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) &&
11261 !foldableSelect(*Def))
11262 return false;
11263
11264 if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
11265 return false;
11266
11267 if (NewDefOpc != Def->getOpcode())
11268 Def->setDesc(get(NewDefOpc));
11269
11270 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11271 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
11272 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
11273 // sX = s_cselect_b64 (non-zero imm), 0
11274 // sLo = copy sX.sub0
11275 // sHi = copy sX.sub1
11276 // sY = s_or_b32 sLo, sHi
11277 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11278 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
11279 const MachineOperand &OrOpnd1 = Def->getOperand(1);
11280 const MachineOperand &OrOpnd2 = Def->getOperand(2);
11281 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11282 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
11283 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
11284 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11285 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
11286 Def2->getOperand(1).isReg() &&
11287 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
11288 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
11289 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
11290 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
11291 if (Select && foldableSelect(*Select))
11292 optimizeSCC(Select, Def, /*NeedInversion=*/false);
11293 }
11294 }
11295 }
11296 return true;
11297 };
11298
// Try to make CmpInstr redundant when its source register is defined by an
// S_AND_B32/S_AND_B64 whose other operand is a single-bit mask. The patterns
// handled are listed in the comment inside the lambda.
//   ExpectedValue - 0 or 1; shifted to the mask's bit position below and then
//                   compared with the captured CmpValue.
//   SrcSize       - width in bits used to truncate the mask and to pick the
//                   32- or 64-bit S_BITCMP opcode.
//   IsReversible  - also accept the opposite immediate (ExpectedValue ^ Mask);
//                   that form is only emitted as S_BITCMP0.
//   IsSigned      - reject masks that select bit SrcSize - 1.
// Returns true only if optimizeSCC accepted the fold; false means nothing was
// matched (NOTE(review): optimizeSCC is defined outside this lambda, confirm
// that it leaves the code untouched when it returns false).
11299 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11300 this](int64_t ExpectedValue, unsigned SrcSize,
11301 bool IsReversible, bool IsSigned) -> bool {
11302 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11303 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11304 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11305 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11306 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11307 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11308 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11309 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11310 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11311 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11312 //
11313 // Signed ge/gt are not used for the sign bit.
11314 //
11315 // If result of the AND is unused except in the compare:
11316 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11317 //
11318 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11319 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11320 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11321 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11322 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11323 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
11324
// The compared register must be defined by a scalar AND; anything else is not
// one of the patterns above.
11325 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11326 if (!Def)
11327 return false;
11328
11329 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11330 Def->getOpcode() != AMDGPU::S_AND_B64)
11331 return false;
11332
// isMask: true if MO is an immediate (or folds to one through getFoldableImm)
// that has exactly one bit set after truncation to SrcSize bits. Mask is
// overwritten on every call that obtains a constant, including calls that
// then return false, so it is only meaningful after a successful call.
11333 int64_t Mask;
11334 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11335 if (MO->isImm())
11336 Mask = MO->getImm();
11337 else if (!getFoldableImm(MO, Mask))
11338 return false;
11339 Mask &= maxUIntN(SrcSize);
11340 return isPowerOf2_64(Mask);
11341 };
11342
// Either AND source may hold the mask; SrcOp ends up pointing at the other
// operand, i.e. the value whose bit is being tested.
11343 MachineOperand *SrcOp = &Def->getOperand(1);
11344 if (isMask(SrcOp))
11345 SrcOp = &Def->getOperand(2);
11346 else if (isMask(&Def->getOperand(2)))
11347 SrcOp = &Def->getOperand(1);
11348 else
11349 return false;
11350
11351 // A valid Mask is required to have a single bit set, hence a non-zero and
11352 // power-of-two value. This verifies that we will not do 64-bit shift below.
11353 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11354 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
// Signed compares are not folded when the mask selects the top bit of the
// SrcSize-wide value (see "Signed ge/gt are not used for the sign bit" above).
11355 if (IsSigned && BitNo == SrcSize - 1)
11356 return false;
11357
// Scale the 0/1 expectation to the mask's bit position so it can be compared
// directly with the compare immediate.
11358 ExpectedValue <<= BitNo;
11359
// A mismatch is still acceptable for reversible compares when the immediate is
// the other one of {0, Mask}; that case tests the inverse condition.
11360 bool IsReversedCC = false;
11361 if (CmpValue != ExpectedValue) {
11362 if (!IsReversible)
11363 return false;
11364 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11365 if (!IsReversedCC)
11366 return false;
11367 }
11368
// The inverted form is only produced as S_BITCMP0, which replaces the AND, so
// the AND result must have exactly one non-debug use (presumably the compare
// being optimized).
11369 Register DefReg = Def->getOperand(0).getReg();
11370 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
11371 return false;
11372
11373 if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false))
11374 return false;
11375
// If the AND result still has non-debug users the AND is kept as is. The
// assert documents that the inverted case cannot reach this point because of
// the single-use check above.
11376 if (!MRI->use_nodbg_empty(DefReg)) {
11377 assert(!IsReversedCC);
11378 return true;
11379 }
11380
11381 // Replace AND with unused result with a S_BITCMP.
11382 MachineBasicBlock *MBB = Def->getParent();
11383
// S_BITCMP0 for the inverted condition, S_BITCMP1 otherwise; the B32/B64
// variant follows SrcSize.
11384 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11385 : AMDGPU::S_BITCMP1_B32
11386 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11387 : AMDGPU::S_BITCMP1_B64;
11388
// The new instruction is inserted in front of the AND (reusing its debug
// location) with the tested operand and the bit index; the AND is then erased.
11389 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
11390 .add(*SrcOp)
11391 .addImm(BitNo);
11392 Def->eraseFromParent();
11393
11394 return true;
11395 };
11396
11397 switch (CmpInstr.getOpcode()) {
11398 default:
11399 break;
11400 case AMDGPU::S_CMP_EQ_U32:
11401 case AMDGPU::S_CMP_EQ_I32:
11402 case AMDGPU::S_CMPK_EQ_U32:
11403 case AMDGPU::S_CMPK_EQ_I32:
11404 return optimizeCmpAnd(1, 32, true, false) ||
11405 optimizeCmpSelect(/*NeedInversion=*/true);
11406 case AMDGPU::S_CMP_GE_U32:
11407 case AMDGPU::S_CMPK_GE_U32:
11408 return optimizeCmpAnd(1, 32, false, false);
11409 case AMDGPU::S_CMP_GE_I32:
11410 case AMDGPU::S_CMPK_GE_I32:
11411 return optimizeCmpAnd(1, 32, false, true);
11412 case AMDGPU::S_CMP_EQ_U64:
11413 return optimizeCmpAnd(1, 64, true, false);
11414 case AMDGPU::S_CMP_LG_U32:
11415 case AMDGPU::S_CMP_LG_I32:
11416 case AMDGPU::S_CMPK_LG_U32:
11417 case AMDGPU::S_CMPK_LG_I32:
11418 return optimizeCmpAnd(0, 32, true, false) ||
11419 optimizeCmpSelect(/*NeedInversion=*/false);
11420 case AMDGPU::S_CMP_GT_U32:
11421 case AMDGPU::S_CMPK_GT_U32:
11422 return optimizeCmpAnd(0, 32, false, false);
11423 case AMDGPU::S_CMP_GT_I32:
11424 case AMDGPU::S_CMPK_GT_I32:
11425 return optimizeCmpAnd(0, 32, false, true);
11426 case AMDGPU::S_CMP_LG_U64:
11427 return optimizeCmpAnd(0, 64, true, false) ||
11428 optimizeCmpSelect(/*NeedInversion=*/false);
11429 }
11430
11431 return false;
11432}
11433
11435 AMDGPU::OpName OpName) const {
// NOTE(review): the first line of this signature and listing lines 11449 and
// 11452 (where MRI and Undef are evidently declared) are absent from this
// listing; the comments below describe only the statements that are visible.
//
// Nothing to do unless the subtarget reports that it needs aligned VGPRs.
11436 if (!ST.needsAlignedVGPRs())
11437 return;
11438
// A negative index is treated as "no such named operand on this opcode".
11439 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11440 if (OpNo < 0)
11441 return;
11442 MachineOperand &Op = MI.getOperand(OpNo);
// Operands whose size is above 4 are left untouched (getOpSize presumably
// reports bytes - TODO confirm).
11443 if (getOpSize(MI, OpNo) > 4)
11444 return;
11445
11446 // Add implicit aligned super-reg to force alignment on the data operand.
11447 const DebugLoc &DL = MI.getDebugLoc();
11448 MachineBasicBlock *BB = MI.getParent();
11450 Register DataReg = Op.getReg();
// AGPR and VGPR data registers use different classes for both the undefined
// filler register and the 64-bit Align2 tuple created below.
11451 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11453 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
// Undef is defined by an IMPLICIT_DEF inserted before MI.
11454 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11455 Register NewVR =
11456 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11457 : &AMDGPU::VReg_64_Align2RegClass);
// Build the tuple before MI: the original data (with its subregister index)
// goes into sub0 and the IMPLICIT_DEF value into sub1.
11458 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11459 .addReg(DataReg, {}, Op.getSubReg())
11460 .addImm(AMDGPU::sub0)
11461 .addReg(Undef)
11462 .addImm(AMDGPU::sub1);
// Redirect the operand to sub0 of the new tuple and append the whole tuple as
// an implicit, non-def register operand of MI.
11463 Op.setReg(NewVR);
11464 Op.setSubReg(AMDGPU::sub0);
11465 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11466 }
11467
11469 if (isIGLP(*MI))
11470 return false;
11471
11473}
11474
// NOTE(review): the signature line of this function is absent from this
// listing; only the body is visible.
// Instructions that are neither WMMA nor SWMMAC never qualify.
11476 if (!isWMMA(MI) && !isSWMMAC(MI))
11477 return false;
11478
// With GFX1250 instructions available the answer is looked up per opcode via
// AMDGPU::getWMMAIsXDL; otherwise every WMMA/SWMMAC instruction qualifies.
11479 if (ST.hasGFX1250Insts())
11480 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11481
11482 return true;
11483 }
11484
// NOTE(review): the signature line of this function is absent from this
// listing; only the body is visible.
11486 unsigned Opcode = MI.getOpcode();
11487
// On GFX12+ the result is decided solely by the DOT and XDL-WMMA predicates.
11488 if (AMDGPU::isGFX12Plus(ST))
11489 return isDOT(MI) || isXDLWMMA(MI);
11490
// Before GFX12: reject anything that is not MAI, DGEMM opcodes, and the two
// ACCVGPR read/write moves.
11491 if (!isMAI(MI) || isDGEMM(Opcode) ||
11492 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11493 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11494 return false;
11495
// Without GFX940 instructions every remaining MAI opcode qualifies; with them
// the decision is deferred to the per-opcode AMDGPU::getMAIIsGFX940XDL query.
11496 if (!ST.hasGFX940Insts())
11497 return true;
11498
11499 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11500 }
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps, ArrayRef< Register > PhySGPRs={})
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static MachineBasicBlock * generateWaterFallLoop(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr, ArrayRef< Register > PhySGPRs={})
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion, unsigned &NewDefOpc)
static bool isSCCDeadOnExit(MachineBasicBlock *MBB)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
#define LLVM_DEBUG(...)
Definition Debug.h:119
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:145
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:123
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:254
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
MIRFormater - Interface to format MIR operand based on target.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr modifies (fully define or partially define) the specified register.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI void moveOperands(MachineOperand *Dst, MachineOperand *Src, unsigned NumOps)
Move NumOps operands from Src to Dst, updating use-def lists as needed.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
LLVM_ABI void clearVirtRegs()
clearVirtRegs - Remove all virtual registers (after physreg assignment).
void setRegAllocationHint(Register VReg, unsigned Type, Register PrefReg)
setRegAllocationHint - Specify a register allocation hint for the specified virtual register.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
iterator_range< use_iterator > use_operands(Register Reg) const
LLVM_ABI void removeRegOperandFromUseList(MachineOperand *MO)
Remove MO from its use-def list.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
LLVM_ABI void addRegOperandToUseList(MachineOperand *MO)
Add MO to the linked list of operands for its register.
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
bool canAddToBBProlog(const MachineInstr &MI) const
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
bool isSpill(uint32_t Opcode) const
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
bool mayAccessScratch(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
unsigned getOpSize(uint32_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool setsSCCIfResultIsNonZero(const MachineInstr &MI)
const MIRFormatter * getMIRFormatter() const override
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
InstSizeVerifyMode getInstSizeVerifyMode(const MachineInstr &MI) const override
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
Register isStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool hasRAWDependency(const MachineInstr &FirstMI, const MachineInstr &SecondMI) const
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
void handleCopyToPhysHelper(SIInstrWorklist &Worklist, Register DstReg, MachineInstr &Inst, MachineRegisterInfo &MRI, DenseMap< MachineInstr *, V2PhysSCopyInfo > &WaterFalls, DenseMap< MachineInstr *, bool > &V2SPhyCopiesToErase) const
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool isVOPDAntidependencyAllowed(const MachineInstr &MI) const
If OpX is multicycle, anti-dependencies are not allowed.
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void createWaterFallForSiCall(MachineInstr *MI, MachineDominatorTree *MDT, ArrayRef< MachineOperand * > ScalarOps, ArrayRef< Register > PhySGPRs={}) const
Wrapper function for generating waterfall for instruction MI. This function takes into consideration of...
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
ValueUniformity getGenericValueUniformity(const MachineInstr &MI) const
static bool isMAI(const MCInstrDesc &Desc)
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const override
static bool usesLGKM_CNT(const MachineInstr &MI)
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
const MachineOperand & getCalleeOperand(const MachineInstr &MI) const override
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
bool isAlwaysGDS(uint32_t Opcode) const
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
void createReadFirstLaneFromCopyToPhysReg(MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool isWWMRegSpillOpcode(uint32_t Opcode)
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
ValueUniformity getValueUniformity(const MachineInstr &MI) const final
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
std::optional< int64_t > getImmOrMaterializedImm(MachineOperand &Op) const
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst, DenseMap< MachineInstr *, V2PhysSCopyInfo > &WaterFalls, DenseMap< MachineInstr *, bool > &V2SPhyCopiesToErase) const
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
unsigned getScratchReservedForDynamicVGPRs() const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual const MachineOperand & getCalleeOperand(const MachineInstr &MI) const
Returns the callee operand from the given MI.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo & getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isDPMACCInstruction(unsigned Opc)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int32_t getCommuteRev(uint32_t Opcode)
LLVM_READONLY int32_t getCommuteOrig(uint32_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READONLY int32_t getGlobalVaddrOp(uint32_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READONLY int32_t getMFMAEarlyClobberOp(uint32_t Opcode)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getIfAddr64Inst(uint32_t Opcode)
Check if Opcode is an Addr64 opcode.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
LLVM_READONLY int32_t getAddr64Inst(uint32_t Opcode)
int32_t getMCOpcode(uint32_t Opcode, unsigned Gen)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:204
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:227
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:213
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:203
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:209
@ OPERAND_REG_IMM_V2FP16_SPLAT
Definition SIDefines.h:212
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:219
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:214
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:228
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:240
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:215
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:251
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:206
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:224
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:226
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:245
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:216
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:241
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:223
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:205
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:231
LLVM_READONLY int32_t getBasicFromSDWAOp(uint32_t Opcode)
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:614
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:616
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:613
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:615
@ TI_CONSTDATA_START
Definition AMDGPU.h:612
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READONLY int32_t getFlatScratchInstSVfromSS(uint32_t Opcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:558
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition MathExtras.h:207
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition MathExtras.h:546
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
int countr_zero(T Val)
Count the number of 0 bits from the least significant bit toward the most significant bit, stopping at the first 1.
Definition bit.h:204
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
constexpr RegState getUndefRegState(bool B)
ValueUniformity
Enum describing how values behave with respect to uniformity and divergence, to answer the question: ...
Definition Uniformity.h:18
@ AlwaysUniform
The result value is always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result value can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
MachineCycleInfo::CycleT MachineCycle
static const MachineMemOperand::Flags MOThreadPrivate
Mark the MMO of accesses to memory locations that are never written to by other threads.
Definition SIInstrInfo.h:63
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
constexpr bool all() const
Definition LaneBitmask.h:54
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:67
MachineInstr * top() const
Definition SIInstrInfo.h:72
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:91
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands read the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.