1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
31#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include "llvm/MC/MCContext.h"
35
36using namespace llvm;
37
38#define DEBUG_TYPE "si-instr-info"
39
40#define GET_INSTRINFO_CTOR_DTOR
41#include "AMDGPUGenInstrInfo.inc"
42
43namespace llvm {
44namespace AMDGPU {
45#define GET_D16ImageDimIntrinsics_IMPL
46#define GET_ImageDimIntrinsicTable_IMPL
47#define GET_RsrcIntrinsics_IMPL
48#include "AMDGPUGenSearchableTables.inc"
49}
50}
51
52
53// Must be at least 4 to be able to branch over minimum unconditional branch
54// code. This is only for making it possible to write reasonably small tests for
55// long branches.
56static cl::opt<unsigned>
57BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
58 cl::desc("Restrict range of branch instructions (DEBUG)"));
59
61 "amdgpu-fix-16-bit-physreg-copies",
62 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
63 cl::init(true),
65
66SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
67 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
68 RI(ST), ST(ST) {
69 SchedModel.init(&ST);
70}
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
83/// Returns true if both nodes have the same value for the given
84/// operand \p Op, or if both nodes do not have this operand.
85static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
86 unsigned Opc0 = N0->getMachineOpcode();
87 unsigned Opc1 = N1->getMachineOpcode();
88
89 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
90 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
91
92 if (Op0Idx == -1 && Op1Idx == -1)
93 return true;
94
95
96 if ((Op0Idx == -1 && Op1Idx != -1) ||
97 (Op1Idx == -1 && Op0Idx != -1))
98 return false;
99
100 // getNamedOperandIdx returns the index for the MachineInstr's operands,
101 // which includes the result as the first operand. We are indexing into the
102 // MachineSDNode's operands, so we need to skip the result operand to get
103 // the real index.
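 // (Illustrative example: if getNamedOperandIdx reports index 3 for an operand
 // of the MachineInstr, the same operand sits at index 2 in the result-less
 // MachineSDNode operand list.)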
104 --Op0Idx;
105 --Op1Idx;
106
107 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
108}
109
110static bool canRemat(const MachineInstr &MI) {
111
112 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
113 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
114 SIInstrInfo::isSALU(MI))
115 return true;
116
117 if (SIInstrInfo::isSMRD(MI)) {
118 return !MI.memoperands_empty() &&
119 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
120 return MMO->isLoad() && MMO->isInvariant();
121 });
122 }
123
124 return false;
125}
126
127bool SIInstrInfo::isReallyTriviallyReMaterializable(
128 const MachineInstr &MI) const {
129
130 if (canRemat(MI)) {
131 // Normally a VALU use of exec would block rematerialization, but an
132 // implicit exec read is OK in this case since all VALU instructions have one.
133 // We really want all of the generic logic for this, except for this check.
134
135 // Another potential implicit use is the mode register. The core logic of
136 // the RA will not attempt rematerialization if mode is set anywhere
137 // in the function; otherwise it is safe, since mode is not changed.
138
139 // This differs from the generic method, which does not allow
140 // rematerialization if there are virtual register uses. We allow this,
141 // which is why this method covers SOP instructions as well.
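 // (Illustrative example: a plain V_MOV_B32_e32 whose only implicit operand is
 // the standard exec use, and which cannot raise an FP exception, satisfies
 // the checks below and is treated as trivially rematerializable.)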
142 if (!MI.hasImplicitDef() &&
143 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
144 !MI.mayRaiseFPException())
145 return true;
146 }
147
148 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
149}
150
151// Returns true if the scalar result of a VALU instruction depends on exec.
152static bool resultDependsOnExec(const MachineInstr &MI) {
153 // Ignore comparisons which are only used masked with exec.
154 // This allows some hoisting/sinking of VALU comparisons.
155 if (MI.isCompare()) {
156 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
157 Register DstReg = MI.getOperand(0).getReg();
158 if (!DstReg.isVirtual())
159 return true;
160 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
161 switch (Use.getOpcode()) {
162 case AMDGPU::S_AND_SAVEEXEC_B32:
163 case AMDGPU::S_AND_SAVEEXEC_B64:
164 break;
165 case AMDGPU::S_AND_B32:
166 case AMDGPU::S_AND_B64:
167 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
168 return true;
169 break;
170 default:
171 return true;
172 }
173 }
174 return false;
175 }
176
177 switch (MI.getOpcode()) {
178 default:
179 break;
180 case AMDGPU::V_READFIRSTLANE_B32:
181 return true;
182 }
183
184 return false;
185}
186
187bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
188 // Any implicit use of exec by VALU is not a real register read.
189 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
190 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
191}
192
193bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
194 MachineBasicBlock *SuccToSinkTo,
195 MachineCycleInfo *CI) const {
196 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
197 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
198 return true;
199
200 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
201 // Check if sinking of MI would create temporal divergent use.
202 for (auto Op : MI.uses()) {
203 if (Op.isReg() && Op.getReg().isVirtual() &&
204 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
205 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
206
207 // SgprDef defined inside cycle
208 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
209 if (FromCycle == nullptr)
210 continue;
211
212 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
213 // Check if there is a FromCycle that contains SgprDef's basic block but
214 // does not contain SuccToSinkTo and also has divergent exit condition.
215 while (FromCycle && !FromCycle->contains(ToCycle)) {
216 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
217 FromCycle->getExitingBlocks(ExitingBlocks);
218
219 // FromCycle has divergent exit condition.
220 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
221 if (hasDivergentBranch(ExitingBlock))
222 return false;
223 }
224
225 FromCycle = FromCycle->getParentCycle();
226 }
227 }
228 }
229
230 return true;
231}
232
233bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
234 int64_t &Offset0,
235 int64_t &Offset1) const {
236 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
237 return false;
238
239 unsigned Opc0 = Load0->getMachineOpcode();
240 unsigned Opc1 = Load1->getMachineOpcode();
241
242 // Make sure both are actually loads.
243 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
244 return false;
245
246 // A mayLoad instruction without a def is not a load. Likely a prefetch.
247 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
248 return false;
249
250 if (isDS(Opc0) && isDS(Opc1)) {
251
252 // FIXME: Handle this case:
253 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
254 return false;
255
256 // Check base reg.
257 if (Load0->getOperand(0) != Load1->getOperand(0))
258 return false;
259
260 // Skip read2 / write2 variants for simplicity.
261 // TODO: We should report true if the used offsets are adjacent (excluding
262 // the st64 versions).
263 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
264 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
265 if (Offset0Idx == -1 || Offset1Idx == -1)
266 return false;
267
268 // XXX - be careful of dataless loads
269 // getNamedOperandIdx returns the index for MachineInstrs. Since they
270 // include the output in the operand list, but SDNodes don't, we need to
271 // subtract the index by one.
272 Offset0Idx -= get(Opc0).NumDefs;
273 Offset1Idx -= get(Opc1).NumDefs;
274 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
275 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
276 return true;
277 }
278
279 if (isSMRD(Opc0) && isSMRD(Opc1)) {
280 // Skip time and cache invalidation instructions.
281 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
282 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
283 return false;
284
285 unsigned NumOps = getNumOperandsNoGlue(Load0);
286 if (NumOps != getNumOperandsNoGlue(Load1))
287 return false;
288
289 // Check base reg.
290 if (Load0->getOperand(0) != Load1->getOperand(0))
291 return false;
292
293 // Match register offsets, if both register and immediate offsets present.
294 assert(NumOps == 4 || NumOps == 5);
295 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
296 return false;
297
298 const ConstantSDNode *Load0Offset =
299 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
300 const ConstantSDNode *Load1Offset =
301 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
302
303 if (!Load0Offset || !Load1Offset)
304 return false;
305
306 Offset0 = Load0Offset->getZExtValue();
307 Offset1 = Load1Offset->getZExtValue();
308 return true;
309 }
310
311 // MUBUF and MTBUF can access the same addresses.
312 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
313
314 // MUBUF and MTBUF have vaddr at different indices.
315 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
316 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
317 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
318 return false;
319
320 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
321 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
322
323 if (OffIdx0 == -1 || OffIdx1 == -1)
324 return false;
325
326 // getNamedOperandIdx returns the index for MachineInstrs. Since they
327 // include the output in the operand list, but SDNodes don't, we need to
328 // subtract the index by one.
329 OffIdx0 -= get(Opc0).NumDefs;
330 OffIdx1 -= get(Opc1).NumDefs;
331
332 SDValue Off0 = Load0->getOperand(OffIdx0);
333 SDValue Off1 = Load1->getOperand(OffIdx1);
334
335 // The offset might be a FrameIndexSDNode.
336 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
337 return false;
338
339 Offset0 = Off0->getAsZExtVal();
340 Offset1 = Off1->getAsZExtVal();
341 return true;
342 }
343
344 return false;
345}
346
347static bool isStride64(unsigned Opc) {
348 switch (Opc) {
349 case AMDGPU::DS_READ2ST64_B32:
350 case AMDGPU::DS_READ2ST64_B64:
351 case AMDGPU::DS_WRITE2ST64_B32:
352 case AMDGPU::DS_WRITE2ST64_B64:
353 return true;
354 default:
355 return false;
356 }
357}
358
359bool SIInstrInfo::getMemOperandsWithOffsetWidth(
360 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
361 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
362 const TargetRegisterInfo *TRI) const {
363 if (!LdSt.mayLoadOrStore())
364 return false;
365
366 unsigned Opc = LdSt.getOpcode();
367 OffsetIsScalable = false;
368 const MachineOperand *BaseOp, *OffsetOp;
369 int DataOpIdx;
370
371 if (isDS(LdSt)) {
372 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
373 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
374 if (OffsetOp) {
375 // Normal, single offset LDS instruction.
376 if (!BaseOp) {
377 // DS_CONSUME/DS_APPEND use M0 for the base address.
378 // TODO: find the implicit use operand for M0 and use that as BaseOp?
379 return false;
380 }
381 BaseOps.push_back(BaseOp);
382 Offset = OffsetOp->getImm();
383 // Get appropriate operand, and compute width accordingly.
384 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
385 if (DataOpIdx == -1)
386 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
387 Width = getOpSize(LdSt, DataOpIdx);
388 } else {
389 // The 2 offset instructions use offset0 and offset1 instead. We can treat
390 // these as a load with a single offset if the 2 offsets are consecutive.
391 // We will use this for some partially aligned loads.
392 const MachineOperand *Offset0Op =
393 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
394 const MachineOperand *Offset1Op =
395 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
396
397 unsigned Offset0 = Offset0Op->getImm() & 0xff;
398 unsigned Offset1 = Offset1Op->getImm() & 0xff;
399 if (Offset0 + 1 != Offset1)
400 return false;
401
402 // Each of these offsets is in element sized units, so we need to convert
403 // to bytes of the individual reads.
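 // (Illustrative example: for a DS_READ2_B32 with offset0 = 2 and offset1 = 3,
 // the 64-bit destination gives EltSize = 64 / 16 = 4, so Offset = 4 * 2 = 8
 // bytes and the two 4-byte reads are treated as one 8-byte access.)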
404
405 unsigned EltSize;
406 if (LdSt.mayLoad())
407 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
408 else {
409 assert(LdSt.mayStore());
410 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
411 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
412 }
413
414 if (isStride64(Opc))
415 EltSize *= 64;
416
417 BaseOps.push_back(BaseOp);
418 Offset = EltSize * Offset0;
419 // Get appropriate operand(s), and compute width accordingly.
420 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
421 if (DataOpIdx == -1) {
422 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
423 Width = getOpSize(LdSt, DataOpIdx);
424 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
425 Width = Width.getValue() + getOpSize(LdSt, DataOpIdx);
426 } else {
427 Width = getOpSize(LdSt, DataOpIdx);
428 }
429 }
430 return true;
431 }
432
433 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
434 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
435 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
436 return false;
437 BaseOps.push_back(RSrc);
438 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
439 if (BaseOp && !BaseOp->isFI())
440 BaseOps.push_back(BaseOp);
441 const MachineOperand *OffsetImm =
442 getNamedOperand(LdSt, AMDGPU::OpName::offset);
443 Offset = OffsetImm->getImm();
444 const MachineOperand *SOffset =
445 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
446 if (SOffset) {
447 if (SOffset->isReg())
448 BaseOps.push_back(SOffset);
449 else
450 Offset += SOffset->getImm();
451 }
452 // Get appropriate operand, and compute width accordingly.
453 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
454 if (DataOpIdx == -1)
455 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
456 if (DataOpIdx == -1) // LDS DMA
457 return false;
458 Width = getOpSize(LdSt, DataOpIdx);
459 return true;
460 }
461
462 if (isImage(LdSt)) {
463 auto RsrcOpName =
464 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
465 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
466 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
467 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
468 if (VAddr0Idx >= 0) {
469 // GFX10 possible NSA encoding.
470 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
471 BaseOps.push_back(&LdSt.getOperand(I));
472 } else {
473 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
474 }
475 Offset = 0;
476 // Get appropriate operand, and compute width accordingly.
477 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
478 Width = getOpSize(LdSt, DataOpIdx);
479 return true;
480 }
481
482 if (isSMRD(LdSt)) {
483 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
484 if (!BaseOp) // e.g. S_MEMTIME
485 return false;
486 BaseOps.push_back(BaseOp);
487 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
488 Offset = OffsetOp ? OffsetOp->getImm() : 0;
489 // Get appropriate operand, and compute width accordingly.
490 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
491 if (DataOpIdx == -1)
492 return false;
493 Width = getOpSize(LdSt, DataOpIdx);
494 return true;
495 }
496
497 if (isFLAT(LdSt)) {
498 // Instructions have either vaddr or saddr or both or none.
499 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
500 if (BaseOp)
501 BaseOps.push_back(BaseOp);
502 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
503 if (BaseOp)
504 BaseOps.push_back(BaseOp);
505 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
506 // Get appropriate operand, and compute width accordingly.
507 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
508 if (DataOpIdx == -1)
509 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
510 if (DataOpIdx == -1) // LDS DMA
511 return false;
512 Width = getOpSize(LdSt, DataOpIdx);
513 return true;
514 }
515
516 return false;
517}
518
519static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
520 ArrayRef<const MachineOperand *> BaseOps1,
521 const MachineInstr &MI2,
522 ArrayRef<const MachineOperand *> BaseOps2) {
523 // Only examine the first "base" operand of each instruction, on the
524 // assumption that it represents the real base address of the memory access.
525 // Other operands are typically offsets or indices from this base address.
526 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
527 return true;
528
529 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
530 return false;
531
532 auto MO1 = *MI1.memoperands_begin();
533 auto MO2 = *MI2.memoperands_begin();
534 if (MO1->getAddrSpace() != MO2->getAddrSpace())
535 return false;
536
537 auto Base1 = MO1->getValue();
538 auto Base2 = MO2->getValue();
539 if (!Base1 || !Base2)
540 return false;
541 Base1 = getUnderlyingObject(Base1);
542 Base2 = getUnderlyingObject(Base2);
543
544 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
545 return false;
546
547 return Base1 == Base2;
548}
549
550bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
551 int64_t Offset1, bool OffsetIsScalable1,
552 ArrayRef<const MachineOperand *> BaseOps2,
553 int64_t Offset2, bool OffsetIsScalable2,
554 unsigned ClusterSize,
555 unsigned NumBytes) const {
556 // If the mem ops (to be clustered) do not have the same base ptr, then they
557 // should not be clustered
558 if (!BaseOps1.empty() && !BaseOps2.empty()) {
559 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
560 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
561 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
562 return false;
563 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
564 // If only one base op is empty, they do not have the same base ptr
565 return false;
566 }
567
568 // In order to avoid register pressure, on average, the number of DWORDS
569 // loaded together by all clustered mem ops should not exceed 8. This is an
570 // empirical value based on certain observations and performance related
571 // experiments.
572 // The good thing about this heuristic is that it avoids clustering of too many
573 // sub-word loads, and also avoids clustering of wide loads. Below is the
574 // brief summary of how the heuristic behaves for various `LoadSize`.
575 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
576 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
577 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
578 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
579 // (5) LoadSize >= 17: do not cluster
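 // (Illustrative example: with ClusterSize = 4 and NumBytes = 32, LoadSize = 8
 // and NumDWORDs = ((8 + 3) / 4) * 4 = 8, so clustering is allowed; with
 // NumBytes = 48, LoadSize = 12 and NumDWORDs = 12 > 8, so it is not.)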
580 const unsigned LoadSize = NumBytes / ClusterSize;
581 const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
582 return NumDWORDs <= 8;
583}
584
585// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
586// the first 16 loads will be interleaved with the stores, and the next 16 will
587 // be clustered as expected. It should really split into 2 batches of 16 stores.
588//
589// Loads are clustered until this returns false, rather than trying to schedule
590// groups of stores. This also means we have to deal with saying different
591// address space loads should be clustered, and ones which might cause bank
592// conflicts.
593//
594// This might be deprecated so it might not be worth that much effort to fix.
595bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
596 int64_t Offset0, int64_t Offset1,
597 unsigned NumLoads) const {
598 assert(Offset1 > Offset0 &&
599 "Second offset should be larger than first offset!");
600 // If we have less than 16 loads in a row, and the offsets are within 64
601 // bytes, then schedule together.
602
603 // A cacheline is 64 bytes (for global memory).
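 // (Illustrative example: two loads at offsets 0 and 48 are within 64 bytes
 // and are scheduled together; offsets 0 and 96 are not.)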
604 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
605}
606
607static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
608 MachineBasicBlock::iterator MI,
609 const DebugLoc &DL, MCRegister DestReg,
610 MCRegister SrcReg, bool KillSrc,
611 const char *Msg = "illegal VGPR to SGPR copy") {
612 MachineFunction *MF = MBB.getParent();
613 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
614 LLVMContext &C = MF->getFunction().getContext();
615 C.diagnose(IllegalCopy);
616
617 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
618 .addReg(SrcReg, getKillRegState(KillSrc));
619}
620
621/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
622/// possible to have a direct copy in these cases on GFX908, so an intermediate
623/// VGPR copy is required.
624static void indirectCopyToAGPR(const SIInstrInfo &TII,
625 MachineBasicBlock &MBB,
626 MachineBasicBlock::iterator MI,
627 const DebugLoc &DL, MCRegister DestReg,
628 MCRegister SrcReg, bool KillSrc,
629 RegScavenger &RS, bool RegsOverlap,
630 Register ImpDefSuperReg = Register(),
631 Register ImpUseSuperReg = Register()) {
632 assert((TII.getSubtarget().hasMAIInsts() &&
633 !TII.getSubtarget().hasGFX90AInsts()) &&
634 "Expected GFX908 subtarget.");
635
636 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
637 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
638 "Source register of the copy should be either an SGPR or an AGPR.");
639
640 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
641 "Destination register of the copy should be an AGPR.");
642
643 const SIRegisterInfo &RI = TII.getRegisterInfo();
644
645 // First try to find defining accvgpr_write to avoid temporary registers.
646 // In the case of copies of overlapping AGPRs, we conservatively do not
647 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
648 // an accvgpr_write used for this same copy due to implicit-defs
649 if (!RegsOverlap) {
650 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
651 --Def;
652
653 if (!Def->modifiesRegister(SrcReg, &RI))
654 continue;
655
656 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
657 Def->getOperand(0).getReg() != SrcReg)
658 break;
659
660 MachineOperand &DefOp = Def->getOperand(1);
661 assert(DefOp.isReg() || DefOp.isImm());
662
663 if (DefOp.isReg()) {
664 bool SafeToPropagate = true;
665 // Check that register source operand is not clobbered before MI.
666 // Immediate operands are always safe to propagate.
667 for (auto I = Def; I != MI && SafeToPropagate; ++I)
668 if (I->modifiesRegister(DefOp.getReg(), &RI))
669 SafeToPropagate = false;
670
671 if (!SafeToPropagate)
672 break;
673
674 DefOp.setIsKill(false);
675 }
676
677 MachineInstrBuilder Builder =
678 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
679 .add(DefOp);
680 if (ImpDefSuperReg)
681 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
682
683 if (ImpUseSuperReg) {
684 Builder.addReg(ImpUseSuperReg,
685 getKillRegState(KillSrc) | RegState::Implicit);
686 }
687
688 return;
689 }
690 }
691
692 RS.enterBasicBlockEnd(MBB);
693 RS.backward(std::next(MI));
694
695 // Ideally we want to have three registers for a long reg_sequence copy
696 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
697 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
698 *MBB.getParent());
699
700 // Registers in the sequence are allocated contiguously so we can just
701 // use register number to pick one of three round-robin temps.
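 // (Illustrative example: for a copy to AGPR5, RegNo = (AGPR5 - AGPR0) % 3 = 2,
 // so up to two additional scratch VGPRs may be scavenged below.)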
702 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
703 Register Tmp =
704 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
706 "VGPR used for an intermediate copy should have been reserved.");
707
708 // Only loop through if there are any free registers left. We don't want to
709 // spill.
710 while (RegNo--) {
711 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
712 /* RestoreAfter */ false, 0,
713 /* AllowSpill */ false);
714 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
715 break;
716 Tmp = Tmp2;
717 RS.setRegUsed(Tmp);
718 }
719
720 // Insert copy to temporary VGPR.
721 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
722 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
723 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
724 } else {
725 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
726 }
727
728 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
729 .addReg(SrcReg, getKillRegState(KillSrc));
730 if (ImpUseSuperReg) {
731 UseBuilder.addReg(ImpUseSuperReg,
732 getKillRegState(KillSrc) | RegState::Implicit);
733 }
734
735 MachineInstrBuilder DefBuilder
736 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
737 .addReg(Tmp, RegState::Kill);
738
739 if (ImpDefSuperReg)
740 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
741}
742
743static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
744 MachineBasicBlock::iterator MI, const DebugLoc &DL,
745 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
746 const TargetRegisterClass *RC, bool Forward) {
747 const SIRegisterInfo &RI = TII.getRegisterInfo();
748 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
749 MachineBasicBlock::iterator I = MI;
750 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
751
752 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
753 int16_t SubIdx = BaseIndices[Idx];
754 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
755 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
756 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
757 unsigned Opcode = AMDGPU::S_MOV_B32;
758
759 // Is SGPR aligned? If so try to combine with next.
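 // (Illustrative example: a copy of s[4:7] becomes two S_MOV_B64 of s[4:5] and
 // s[6:7] instead of four S_MOV_B32.)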
760 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
761 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
762 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
763 // Can use SGPR64 copy
764 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
765 SubIdx = RI.getSubRegFromChannel(Channel, 2);
766 DestSubReg = RI.getSubReg(DestReg, SubIdx);
767 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
768 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
769 Opcode = AMDGPU::S_MOV_B64;
770 Idx++;
771 }
772
773 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
774 .addReg(SrcSubReg)
775 .addReg(SrcReg, RegState::Implicit);
776
777 if (!FirstMI)
778 FirstMI = LastMI;
779
780 if (!Forward)
781 I--;
782 }
783
784 assert(FirstMI && LastMI);
785 if (!Forward)
786 std::swap(FirstMI, LastMI);
787
788 FirstMI->addOperand(
789 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
790
791 if (KillSrc)
792 LastMI->addRegisterKilled(SrcReg, &RI);
793}
794
795void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
796 MachineBasicBlock::iterator MI,
797 const DebugLoc &DL, MCRegister DestReg,
798 MCRegister SrcReg, bool KillSrc) const {
799 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
800 unsigned Size = RI.getRegSizeInBits(*RC);
801 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
802 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
803
804 // The rest of copyPhysReg assumes Src and Dst size are the same size.
805 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
806 // we remove Fix16BitCopies and this code block?
807 if (Fix16BitCopies) {
808 if (((Size == 16) != (SrcSize == 16))) {
809 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
811 MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
812 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
813 RegToFix = SubReg;
814
815 if (DestReg == SrcReg) {
816 // Identity copy. Insert empty bundle since ExpandPostRA expects an
817 // instruction here.
818 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
819 return;
820 }
821 RC = RI.getPhysRegBaseClass(DestReg);
822 Size = RI.getRegSizeInBits(*RC);
823 SrcRC = RI.getPhysRegBaseClass(SrcReg);
824 SrcSize = RI.getRegSizeInBits(*SrcRC);
825 }
826 }
827
828 if (RC == &AMDGPU::VGPR_32RegClass) {
829 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
830 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
831 AMDGPU::AGPR_32RegClass.contains(SrcReg));
832 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
833 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
834 BuildMI(MBB, MI, DL, get(Opc), DestReg)
835 .addReg(SrcReg, getKillRegState(KillSrc));
836 return;
837 }
838
839 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
840 RC == &AMDGPU::SReg_32RegClass) {
841 if (SrcReg == AMDGPU::SCC) {
842 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
843 .addImm(1)
844 .addImm(0);
845 return;
846 }
847
848 if (DestReg == AMDGPU::VCC_LO) {
849 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
850 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
851 .addReg(SrcReg, getKillRegState(KillSrc));
852 } else {
853 // FIXME: Hack until VReg_1 removed.
854 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
855 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
856 .addImm(0)
857 .addReg(SrcReg, getKillRegState(KillSrc));
858 }
859
860 return;
861 }
862
863 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
864 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
865 return;
866 }
867
868 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
869 .addReg(SrcReg, getKillRegState(KillSrc));
870 return;
871 }
872
873 if (RC == &AMDGPU::SReg_64RegClass) {
874 if (SrcReg == AMDGPU::SCC) {
875 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
876 .addImm(1)
877 .addImm(0);
878 return;
879 }
880
881 if (DestReg == AMDGPU::VCC) {
882 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
883 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 } else {
886 // FIXME: Hack until VReg_1 removed.
887 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
888 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
889 .addImm(0)
890 .addReg(SrcReg, getKillRegState(KillSrc));
891 }
892
893 return;
894 }
895
896 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
897 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
898 return;
899 }
900
901 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
902 .addReg(SrcReg, getKillRegState(KillSrc));
903 return;
904 }
905
906 if (DestReg == AMDGPU::SCC) {
907 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
908 // but SelectionDAG emits such copies for i1 sources.
909 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
910 // This copy can only be produced by patterns
911 // with explicit SCC, which are known to be enabled
912 // only for subtargets with S_CMP_LG_U64 present.
913 assert(ST.hasScalarCompareEq64());
914 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
915 .addReg(SrcReg, getKillRegState(KillSrc))
916 .addImm(0);
917 } else {
918 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
919 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
920 .addReg(SrcReg, getKillRegState(KillSrc))
921 .addImm(0);
922 }
923
924 return;
925 }
926
927 if (RC == &AMDGPU::AGPR_32RegClass) {
928 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
929 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
930 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
931 .addReg(SrcReg, getKillRegState(KillSrc));
932 return;
933 }
934
935 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
936 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
937 .addReg(SrcReg, getKillRegState(KillSrc));
938 return;
939 }
940
941 // FIXME: Pass should maintain scavenger to avoid scan through the block on
942 // every AGPR spill.
943 RegScavenger RS;
944 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
945 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
946 return;
947 }
948
949 if (Size == 16) {
950 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
951 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
952 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
953
954 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
955 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
956 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
957 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
958 bool DstLow = !AMDGPU::isHi(DestReg, RI);
959 bool SrcLow = !AMDGPU::isHi(SrcReg, RI);
960 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
961 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
962
963 if (IsSGPRDst) {
964 if (!IsSGPRSrc) {
965 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
966 return;
967 }
968
969 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
970 .addReg(NewSrcReg, getKillRegState(KillSrc));
971 return;
972 }
973
974 if (IsAGPRDst || IsAGPRSrc) {
975 if (!DstLow || !SrcLow) {
976 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
977 "Cannot use hi16 subreg with an AGPR!");
978 }
979
980 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
981 return;
982 }
983
984 if (ST.hasTrue16BitInsts()) {
985 if (IsSGPRSrc) {
986 assert(SrcLow);
987 SrcReg = NewSrcReg;
988 }
989 // Use the smaller instruction encoding if possible.
990 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
991 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
992 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
993 .addReg(SrcReg);
994 } else {
995 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
996 .addImm(0) // src0_modifiers
997 .addReg(SrcReg)
998 .addImm(0); // op_sel
999 }
1000 return;
1001 }
1002
1003 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1004 if (!DstLow || !SrcLow) {
1005 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1006 "Cannot use hi16 subreg on VI!");
1007 }
1008
1009 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1010 .addReg(NewSrcReg, getKillRegState(KillSrc));
1011 return;
1012 }
1013
1014 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1015 .addImm(0) // src0_modifiers
1016 .addReg(NewSrcReg)
1017 .addImm(0) // clamp
1018 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1019 : AMDGPU::SDWA::SdwaSel::WORD_1)
1020 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1021 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1022 : AMDGPU::SDWA::SdwaSel::WORD_1)
1023 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1024 // First implicit operand is $exec.
1025 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1026 return;
1027 }
1028
1029 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1030 if (ST.hasMovB64()) {
1031 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1032 .addReg(SrcReg, getKillRegState(KillSrc));
1033 return;
1034 }
1035 if (ST.hasPkMovB32()) {
1036 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1037 .addImm(SISrcMods::OP_SEL_1)
1038 .addReg(SrcReg)
1039 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1040 .addReg(SrcReg)
1041 .addImm(0) // op_sel_lo
1042 .addImm(0) // op_sel_hi
1043 .addImm(0) // neg_lo
1044 .addImm(0) // neg_hi
1045 .addImm(0) // clamp
1046 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1047 return;
1048 }
1049 }
1050
1051 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1052 if (RI.isSGPRClass(RC)) {
1053 if (!RI.isSGPRClass(SrcRC)) {
1054 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1055 return;
1056 }
1057 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1058 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1059 Forward);
1060 return;
1061 }
1062
1063 unsigned EltSize = 4;
1064 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1065 if (RI.isAGPRClass(RC)) {
1066 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1067 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1068 else if (RI.hasVGPRs(SrcRC) ||
1069 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1070 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1071 else
1072 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1073 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1074 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1075 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1076 (RI.isProperlyAlignedRC(*RC) &&
1077 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1078 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1079 if (ST.hasMovB64()) {
1080 Opcode = AMDGPU::V_MOV_B64_e32;
1081 EltSize = 8;
1082 } else if (ST.hasPkMovB32()) {
1083 Opcode = AMDGPU::V_PK_MOV_B32;
1084 EltSize = 8;
1085 }
1086 }
1087
1088 // For the cases where we need an intermediate instruction/temporary register
1089 // (destination is an AGPR), we need a scavenger.
1090 //
1091 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1092 // whole block for every handled copy.
1093 std::unique_ptr<RegScavenger> RS;
1094 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1095 RS.reset(new RegScavenger());
1096
1097 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1098
1099 // If there is an overlap, we can't kill the super-register on the last
1100 // instruction, since it will also kill the components made live by this def.
1101 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1102 const bool CanKillSuperReg = KillSrc && !Overlap;
1103
1104 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1105 unsigned SubIdx;
1106 if (Forward)
1107 SubIdx = SubIndices[Idx];
1108 else
1109 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1110 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1111 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1112 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1113
1114 bool IsFirstSubreg = Idx == 0;
1115 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1116
1117 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1118 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1119 Register ImpUseSuper = SrcReg;
1120 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1121 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1122 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1123 MachineInstrBuilder MIB =
1124 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1125 .addImm(SISrcMods::OP_SEL_1)
1126 .addReg(SrcSubReg)
1127 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1128 .addReg(SrcSubReg)
1129 .addImm(0) // op_sel_lo
1130 .addImm(0) // op_sel_hi
1131 .addImm(0) // neg_lo
1132 .addImm(0) // neg_hi
1133 .addImm(0) // clamp
1134 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1135 if (IsFirstSubreg)
1136 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1137 } else {
1138 MachineInstrBuilder Builder =
1139 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1140 if (IsFirstSubreg)
1141 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1142
1143 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1144 }
1145 }
1146}
1147
1148int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1149 int NewOpc;
1150
1151 // Try to map original to commuted opcode
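 // (Illustrative example: V_SUB_F32 maps to its commuted form V_SUBREV_F32 and
 // vice versa; opcodes with no REV form fall through and are returned as-is.)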
1152 NewOpc = AMDGPU::getCommuteRev(Opcode);
1153 if (NewOpc != -1)
1154 // Check if the commuted (REV) opcode exists on the target.
1155 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1156
1157 // Try to map commuted to original opcode
1158 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1159 if (NewOpc != -1)
1160 // Check if the original (non-REV) opcode exists on the target.
1161 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1162
1163 return Opcode;
1164}
1165
1166void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
1167 MachineBasicBlock::iterator MI,
1168 const DebugLoc &DL, Register DestReg,
1169 int64_t Value) const {
1170 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1171 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1172 if (RegClass == &AMDGPU::SReg_32RegClass ||
1173 RegClass == &AMDGPU::SGPR_32RegClass ||
1174 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1175 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1176 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1177 .addImm(Value);
1178 return;
1179 }
1180
1181 if (RegClass == &AMDGPU::SReg_64RegClass ||
1182 RegClass == &AMDGPU::SGPR_64RegClass ||
1183 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1184 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1185 .addImm(Value);
1186 return;
1187 }
1188
1189 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1190 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1191 .addImm(Value);
1192 return;
1193 }
1194 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1195 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1196 .addImm(Value);
1197 return;
1198 }
1199
1200 unsigned EltSize = 4;
1201 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1202 if (RI.isSGPRClass(RegClass)) {
1203 if (RI.getRegSizeInBits(*RegClass) > 32) {
1204 Opcode = AMDGPU::S_MOV_B64;
1205 EltSize = 8;
1206 } else {
1207 Opcode = AMDGPU::S_MOV_B32;
1208 EltSize = 4;
1209 }
1210 }
1211
1212 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1213 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1214 int64_t IdxValue = Idx == 0 ? Value : 0;
1215
1216 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1217 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1218 Builder.addImm(IdxValue);
1219 }
1220}
1221
1222const TargetRegisterClass *
1223SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1224 return &AMDGPU::VGPR_32RegClass;
1225}
1226
1227void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1228 MachineBasicBlock::iterator I,
1229 const DebugLoc &DL, Register DstReg,
1230 ArrayRef<MachineOperand> Cond,
1231 Register TrueReg,
1232 Register FalseReg) const {
1233 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1234 const TargetRegisterClass *BoolXExecRC =
1235 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1236 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1237 "Not a VGPR32 reg");
1238
1239 if (Cond.size() == 1) {
1240 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1241 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1242 .add(Cond[0]);
1243 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1244 .addImm(0)
1245 .addReg(FalseReg)
1246 .addImm(0)
1247 .addReg(TrueReg)
1248 .addReg(SReg);
1249 } else if (Cond.size() == 2) {
1250 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1251 switch (Cond[0].getImm()) {
1252 case SIInstrInfo::SCC_TRUE: {
1253 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1254 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1255 : AMDGPU::S_CSELECT_B64), SReg)
1256 .addImm(1)
1257 .addImm(0);
1258 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1259 .addImm(0)
1260 .addReg(FalseReg)
1261 .addImm(0)
1262 .addReg(TrueReg)
1263 .addReg(SReg);
1264 break;
1265 }
1266 case SIInstrInfo::SCC_FALSE: {
1267 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1268 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1269 : AMDGPU::S_CSELECT_B64), SReg)
1270 .addImm(0)
1271 .addImm(1);
1272 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1273 .addImm(0)
1274 .addReg(FalseReg)
1275 .addImm(0)
1276 .addReg(TrueReg)
1277 .addReg(SReg);
1278 break;
1279 }
1280 case SIInstrInfo::VCCNZ: {
1281 MachineOperand RegOp = Cond[1];
1282 RegOp.setImplicit(false);
1283 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1284 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1285 .add(RegOp);
1286 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1287 .addImm(0)
1288 .addReg(FalseReg)
1289 .addImm(0)
1290 .addReg(TrueReg)
1291 .addReg(SReg);
1292 break;
1293 }
1294 case SIInstrInfo::VCCZ: {
1295 MachineOperand RegOp = Cond[1];
1296 RegOp.setImplicit(false);
1297 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1298 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1299 .add(RegOp);
1300 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1301 .addImm(0)
1302 .addReg(TrueReg)
1303 .addImm(0)
1304 .addReg(FalseReg)
1305 .addReg(SReg);
1306 break;
1307 }
1308 case SIInstrInfo::EXECNZ: {
1309 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1310 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1311 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1312 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1313 .addImm(0);
1314 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1315 : AMDGPU::S_CSELECT_B64), SReg)
1316 .addImm(1)
1317 .addImm(0);
1318 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1319 .addImm(0)
1320 .addReg(FalseReg)
1321 .addImm(0)
1322 .addReg(TrueReg)
1323 .addReg(SReg);
1324 break;
1325 }
1326 case SIInstrInfo::EXECZ: {
1327 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1328 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1329 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1330 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1331 .addImm(0);
1332 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1333 : AMDGPU::S_CSELECT_B64), SReg)
1334 .addImm(0)
1335 .addImm(1);
1336 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1337 .addImm(0)
1338 .addReg(FalseReg)
1339 .addImm(0)
1340 .addReg(TrueReg)
1341 .addReg(SReg);
1342 llvm_unreachable("Unhandled branch predicate EXECZ");
1343 break;
1344 }
1345 default:
1346 llvm_unreachable("invalid branch predicate");
1347 }
1348 } else {
1349 llvm_unreachable("Can only handle Cond size 1 or 2");
1350 }
1351}
1352
1353Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1354 MachineBasicBlock::iterator I,
1355 const DebugLoc &DL,
1356 Register SrcReg, int Value) const {
1357 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1358 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1359 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1360 .addImm(Value)
1361 .addReg(SrcReg);
1362
1363 return Reg;
1364}
1365
1366Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1367 MachineBasicBlock::iterator I,
1368 const DebugLoc &DL,
1369 Register SrcReg, int Value) const {
1370 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1371 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1372 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1373 .addImm(Value)
1374 .addReg(SrcReg);
1375
1376 return Reg;
1377}
1378
1379unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1380
1381 if (RI.isAGPRClass(DstRC))
1382 return AMDGPU::COPY;
1383 if (RI.getRegSizeInBits(*DstRC) == 16) {
1384 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1385 // before RA.
1386 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1387 } else if (RI.getRegSizeInBits(*DstRC) == 32) {
1388 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1389 } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
1390 return AMDGPU::S_MOV_B64;
1391 } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
1392 return AMDGPU::V_MOV_B64_PSEUDO;
1393 }
1394 return AMDGPU::COPY;
1395}
1396
1397const MCInstrDesc &
1398SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1399 bool IsIndirectSrc) const {
1400 if (IsIndirectSrc) {
1401 if (VecSize <= 32) // 4 bytes
1402 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1403 if (VecSize <= 64) // 8 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1405 if (VecSize <= 96) // 12 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1407 if (VecSize <= 128) // 16 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1409 if (VecSize <= 160) // 20 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1411 if (VecSize <= 256) // 32 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1413 if (VecSize <= 288) // 36 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1415 if (VecSize <= 320) // 40 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1417 if (VecSize <= 352) // 44 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1419 if (VecSize <= 384) // 48 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1421 if (VecSize <= 512) // 64 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1423 if (VecSize <= 1024) // 128 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1425
1426 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1427 }
1428
1429 if (VecSize <= 32) // 4 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1431 if (VecSize <= 64) // 8 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1433 if (VecSize <= 96) // 12 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1435 if (VecSize <= 128) // 16 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1437 if (VecSize <= 160) // 20 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1439 if (VecSize <= 256) // 32 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1441 if (VecSize <= 288) // 36 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1443 if (VecSize <= 320) // 40 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1445 if (VecSize <= 352) // 44 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1447 if (VecSize <= 384) // 48 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1449 if (VecSize <= 512) // 64 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1451 if (VecSize <= 1024) // 128 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1453
1454 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1455}
1456
1457static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1458 if (VecSize <= 32) // 4 bytes
1459 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1460 if (VecSize <= 64) // 8 bytes
1461 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1462 if (VecSize <= 96) // 12 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1464 if (VecSize <= 128) // 16 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1466 if (VecSize <= 160) // 20 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1468 if (VecSize <= 256) // 32 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1470 if (VecSize <= 288) // 36 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1472 if (VecSize <= 320) // 40 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1474 if (VecSize <= 352) // 44 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1476 if (VecSize <= 384) // 48 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1478 if (VecSize <= 512) // 64 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1480 if (VecSize <= 1024) // 128 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1482
1483 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1484}
1485
1486static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1487 if (VecSize <= 32) // 4 bytes
1488 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1489 if (VecSize <= 64) // 8 bytes
1490 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1491 if (VecSize <= 96) // 12 bytes
1492 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1493 if (VecSize <= 128) // 16 bytes
1494 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1495 if (VecSize <= 160) // 20 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1497 if (VecSize <= 256) // 32 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1499 if (VecSize <= 288) // 36 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1501 if (VecSize <= 320) // 40 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1503 if (VecSize <= 352) // 44 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1505 if (VecSize <= 384) // 48 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1507 if (VecSize <= 512) // 64 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1509 if (VecSize <= 1024) // 128 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1511
1512 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1513}
1514
1515static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1516 if (VecSize <= 64) // 8 bytes
1517 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1518 if (VecSize <= 128) // 16 bytes
1519 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1520 if (VecSize <= 256) // 32 bytes
1521 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1522 if (VecSize <= 512) // 64 bytes
1523 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1524 if (VecSize <= 1024) // 128 bytes
1525 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1526
1527 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1528}
1529
1530const MCInstrDesc &
1531SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1532 bool IsSGPR) const {
1533 if (IsSGPR) {
1534 switch (EltSize) {
1535 case 32:
1536 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1537 case 64:
1538 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1539 default:
1540 llvm_unreachable("invalid reg indexing elt size");
1541 }
1542 }
1543
1544 assert(EltSize == 32 && "invalid reg indexing elt size");
1545 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1546}
1547
1548static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1549 switch (Size) {
1550 case 4:
1551 return AMDGPU::SI_SPILL_S32_SAVE;
1552 case 8:
1553 return AMDGPU::SI_SPILL_S64_SAVE;
1554 case 12:
1555 return AMDGPU::SI_SPILL_S96_SAVE;
1556 case 16:
1557 return AMDGPU::SI_SPILL_S128_SAVE;
1558 case 20:
1559 return AMDGPU::SI_SPILL_S160_SAVE;
1560 case 24:
1561 return AMDGPU::SI_SPILL_S192_SAVE;
1562 case 28:
1563 return AMDGPU::SI_SPILL_S224_SAVE;
1564 case 32:
1565 return AMDGPU::SI_SPILL_S256_SAVE;
1566 case 36:
1567 return AMDGPU::SI_SPILL_S288_SAVE;
1568 case 40:
1569 return AMDGPU::SI_SPILL_S320_SAVE;
1570 case 44:
1571 return AMDGPU::SI_SPILL_S352_SAVE;
1572 case 48:
1573 return AMDGPU::SI_SPILL_S384_SAVE;
1574 case 64:
1575 return AMDGPU::SI_SPILL_S512_SAVE;
1576 case 128:
1577 return AMDGPU::SI_SPILL_S1024_SAVE;
1578 default:
1579 llvm_unreachable("unknown register size");
1580 }
1581}
1582
1583static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1584 switch (Size) {
1585 case 4:
1586 return AMDGPU::SI_SPILL_V32_SAVE;
1587 case 8:
1588 return AMDGPU::SI_SPILL_V64_SAVE;
1589 case 12:
1590 return AMDGPU::SI_SPILL_V96_SAVE;
1591 case 16:
1592 return AMDGPU::SI_SPILL_V128_SAVE;
1593 case 20:
1594 return AMDGPU::SI_SPILL_V160_SAVE;
1595 case 24:
1596 return AMDGPU::SI_SPILL_V192_SAVE;
1597 case 28:
1598 return AMDGPU::SI_SPILL_V224_SAVE;
1599 case 32:
1600 return AMDGPU::SI_SPILL_V256_SAVE;
1601 case 36:
1602 return AMDGPU::SI_SPILL_V288_SAVE;
1603 case 40:
1604 return AMDGPU::SI_SPILL_V320_SAVE;
1605 case 44:
1606 return AMDGPU::SI_SPILL_V352_SAVE;
1607 case 48:
1608 return AMDGPU::SI_SPILL_V384_SAVE;
1609 case 64:
1610 return AMDGPU::SI_SPILL_V512_SAVE;
1611 case 128:
1612 return AMDGPU::SI_SPILL_V1024_SAVE;
1613 default:
1614 llvm_unreachable("unknown register size");
1615 }
1616}
1617
1618static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1619 switch (Size) {
1620 case 4:
1621 return AMDGPU::SI_SPILL_A32_SAVE;
1622 case 8:
1623 return AMDGPU::SI_SPILL_A64_SAVE;
1624 case 12:
1625 return AMDGPU::SI_SPILL_A96_SAVE;
1626 case 16:
1627 return AMDGPU::SI_SPILL_A128_SAVE;
1628 case 20:
1629 return AMDGPU::SI_SPILL_A160_SAVE;
1630 case 24:
1631 return AMDGPU::SI_SPILL_A192_SAVE;
1632 case 28:
1633 return AMDGPU::SI_SPILL_A224_SAVE;
1634 case 32:
1635 return AMDGPU::SI_SPILL_A256_SAVE;
1636 case 36:
1637 return AMDGPU::SI_SPILL_A288_SAVE;
1638 case 40:
1639 return AMDGPU::SI_SPILL_A320_SAVE;
1640 case 44:
1641 return AMDGPU::SI_SPILL_A352_SAVE;
1642 case 48:
1643 return AMDGPU::SI_SPILL_A384_SAVE;
1644 case 64:
1645 return AMDGPU::SI_SPILL_A512_SAVE;
1646 case 128:
1647 return AMDGPU::SI_SPILL_A1024_SAVE;
1648 default:
1649 llvm_unreachable("unknown register size");
1650 }
1651}
1652
1653static unsigned getAVSpillSaveOpcode(unsigned Size) {
1654 switch (Size) {
1655 case 4:
1656 return AMDGPU::SI_SPILL_AV32_SAVE;
1657 case 8:
1658 return AMDGPU::SI_SPILL_AV64_SAVE;
1659 case 12:
1660 return AMDGPU::SI_SPILL_AV96_SAVE;
1661 case 16:
1662 return AMDGPU::SI_SPILL_AV128_SAVE;
1663 case 20:
1664 return AMDGPU::SI_SPILL_AV160_SAVE;
1665 case 24:
1666 return AMDGPU::SI_SPILL_AV192_SAVE;
1667 case 28:
1668 return AMDGPU::SI_SPILL_AV224_SAVE;
1669 case 32:
1670 return AMDGPU::SI_SPILL_AV256_SAVE;
1671 case 36:
1672 return AMDGPU::SI_SPILL_AV288_SAVE;
1673 case 40:
1674 return AMDGPU::SI_SPILL_AV320_SAVE;
1675 case 44:
1676 return AMDGPU::SI_SPILL_AV352_SAVE;
1677 case 48:
1678 return AMDGPU::SI_SPILL_AV384_SAVE;
1679 case 64:
1680 return AMDGPU::SI_SPILL_AV512_SAVE;
1681 case 128:
1682 return AMDGPU::SI_SPILL_AV1024_SAVE;
1683 default:
1684 llvm_unreachable("unknown register size");
1685 }
1686}
1687
1688static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1689 bool IsVectorSuperClass) {
1690 // Currently, only 32-bit WWM register spills are needed.
1691 if (Size != 4)
1692 llvm_unreachable("unknown wwm register spill size");
1693
1694 if (IsVectorSuperClass)
1695 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1696
1697 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1698}
1699
1700static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1701 const TargetRegisterClass *RC,
1702 unsigned Size,
1703 const SIRegisterInfo &TRI,
1704 const SIMachineFunctionInfo &MFI) {
1705 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1706
1707 // Choose the right opcode if spilling a WWM register.
1708 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1709 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1710
1711 if (IsVectorSuperClass)
1712 return getAVSpillSaveOpcode(Size);
1713
1714 return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1715 : getVGPRSpillSaveOpcode(Size);
1716}
1717
1718void SIInstrInfo::storeRegToStackSlot(
1719 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1720 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1721 const TargetRegisterInfo *TRI, Register VReg) const {
1722 MachineFunction *MF = MBB.getParent();
1723 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1724 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1725 const DebugLoc &DL = MBB.findDebugLoc(MI);
1726
1727 MachinePointerInfo PtrInfo
1728 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1729 MachineMemOperand *MMO = MF->getMachineMemOperand(
1730 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1731 FrameInfo.getObjectAlign(FrameIndex));
1732 unsigned SpillSize = TRI->getSpillSize(*RC);
1733
1734 MachineRegisterInfo &MRI = MF->getRegInfo();
1735 if (RI.isSGPRClass(RC)) {
1736 MFI->setHasSpilledSGPRs();
1737 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1738 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1739 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1740
1741 // We are only allowed to create one new instruction when spilling
1742 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1743 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1744
1745 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1746 // to make sure we are using the correct register class.
1747 if (SrcReg.isVirtual() && SpillSize == 4) {
1748 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1749 }
1750
1751 BuildMI(MBB, MI, DL, OpDesc)
1752 .addReg(SrcReg, getKillRegState(isKill)) // data
1753 .addFrameIndex(FrameIndex) // addr
1754 .addMemOperand(MMO)
1755 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1756
1757 if (RI.spillSGPRToVGPR())
1758 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1759 return;
1760 }
1761
1762 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1763 SpillSize, RI, *MFI);
1764 MFI->setHasSpilledVGPRs();
1765
1766 BuildMI(MBB, MI, DL, get(Opcode))
1767 .addReg(SrcReg, getKillRegState(isKill)) // data
1768 .addFrameIndex(FrameIndex) // addr
1769 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1770 .addImm(0) // offset
1771 .addMemOperand(MMO);
1772}
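// Editorial sketch (illustrative MIR only, not part of the original source):
// after this function an SGPR spill is a single pseudo on a stack slot that is
// retagged as TargetStackID::SGPRSpill, e.g.
//   SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, ...
// while a vector spill is the size-matched pseudo that also carries the stack
// pointer offset register and a 0 offset, e.g.
//   SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.1, $sgpr32, 0, ...
// Both forms are expanded into real scratch accesses later in the pipeline.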
1773
1774static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1775 switch (Size) {
1776 case 4:
1777 return AMDGPU::SI_SPILL_S32_RESTORE;
1778 case 8:
1779 return AMDGPU::SI_SPILL_S64_RESTORE;
1780 case 12:
1781 return AMDGPU::SI_SPILL_S96_RESTORE;
1782 case 16:
1783 return AMDGPU::SI_SPILL_S128_RESTORE;
1784 case 20:
1785 return AMDGPU::SI_SPILL_S160_RESTORE;
1786 case 24:
1787 return AMDGPU::SI_SPILL_S192_RESTORE;
1788 case 28:
1789 return AMDGPU::SI_SPILL_S224_RESTORE;
1790 case 32:
1791 return AMDGPU::SI_SPILL_S256_RESTORE;
1792 case 36:
1793 return AMDGPU::SI_SPILL_S288_RESTORE;
1794 case 40:
1795 return AMDGPU::SI_SPILL_S320_RESTORE;
1796 case 44:
1797 return AMDGPU::SI_SPILL_S352_RESTORE;
1798 case 48:
1799 return AMDGPU::SI_SPILL_S384_RESTORE;
1800 case 64:
1801 return AMDGPU::SI_SPILL_S512_RESTORE;
1802 case 128:
1803 return AMDGPU::SI_SPILL_S1024_RESTORE;
1804 default:
1805 llvm_unreachable("unknown register size");
1806 }
1807}
1808
1809static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1810 switch (Size) {
1811 case 4:
1812 return AMDGPU::SI_SPILL_V32_RESTORE;
1813 case 8:
1814 return AMDGPU::SI_SPILL_V64_RESTORE;
1815 case 12:
1816 return AMDGPU::SI_SPILL_V96_RESTORE;
1817 case 16:
1818 return AMDGPU::SI_SPILL_V128_RESTORE;
1819 case 20:
1820 return AMDGPU::SI_SPILL_V160_RESTORE;
1821 case 24:
1822 return AMDGPU::SI_SPILL_V192_RESTORE;
1823 case 28:
1824 return AMDGPU::SI_SPILL_V224_RESTORE;
1825 case 32:
1826 return AMDGPU::SI_SPILL_V256_RESTORE;
1827 case 36:
1828 return AMDGPU::SI_SPILL_V288_RESTORE;
1829 case 40:
1830 return AMDGPU::SI_SPILL_V320_RESTORE;
1831 case 44:
1832 return AMDGPU::SI_SPILL_V352_RESTORE;
1833 case 48:
1834 return AMDGPU::SI_SPILL_V384_RESTORE;
1835 case 64:
1836 return AMDGPU::SI_SPILL_V512_RESTORE;
1837 case 128:
1838 return AMDGPU::SI_SPILL_V1024_RESTORE;
1839 default:
1840 llvm_unreachable("unknown register size");
1841 }
1842}
1843
1844static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1845 switch (Size) {
1846 case 4:
1847 return AMDGPU::SI_SPILL_A32_RESTORE;
1848 case 8:
1849 return AMDGPU::SI_SPILL_A64_RESTORE;
1850 case 12:
1851 return AMDGPU::SI_SPILL_A96_RESTORE;
1852 case 16:
1853 return AMDGPU::SI_SPILL_A128_RESTORE;
1854 case 20:
1855 return AMDGPU::SI_SPILL_A160_RESTORE;
1856 case 24:
1857 return AMDGPU::SI_SPILL_A192_RESTORE;
1858 case 28:
1859 return AMDGPU::SI_SPILL_A224_RESTORE;
1860 case 32:
1861 return AMDGPU::SI_SPILL_A256_RESTORE;
1862 case 36:
1863 return AMDGPU::SI_SPILL_A288_RESTORE;
1864 case 40:
1865 return AMDGPU::SI_SPILL_A320_RESTORE;
1866 case 44:
1867 return AMDGPU::SI_SPILL_A352_RESTORE;
1868 case 48:
1869 return AMDGPU::SI_SPILL_A384_RESTORE;
1870 case 64:
1871 return AMDGPU::SI_SPILL_A512_RESTORE;
1872 case 128:
1873 return AMDGPU::SI_SPILL_A1024_RESTORE;
1874 default:
1875 llvm_unreachable("unknown register size");
1876 }
1877}
1878
1879static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1880 switch (Size) {
1881 case 4:
1882 return AMDGPU::SI_SPILL_AV32_RESTORE;
1883 case 8:
1884 return AMDGPU::SI_SPILL_AV64_RESTORE;
1885 case 12:
1886 return AMDGPU::SI_SPILL_AV96_RESTORE;
1887 case 16:
1888 return AMDGPU::SI_SPILL_AV128_RESTORE;
1889 case 20:
1890 return AMDGPU::SI_SPILL_AV160_RESTORE;
1891 case 24:
1892 return AMDGPU::SI_SPILL_AV192_RESTORE;
1893 case 28:
1894 return AMDGPU::SI_SPILL_AV224_RESTORE;
1895 case 32:
1896 return AMDGPU::SI_SPILL_AV256_RESTORE;
1897 case 36:
1898 return AMDGPU::SI_SPILL_AV288_RESTORE;
1899 case 40:
1900 return AMDGPU::SI_SPILL_AV320_RESTORE;
1901 case 44:
1902 return AMDGPU::SI_SPILL_AV352_RESTORE;
1903 case 48:
1904 return AMDGPU::SI_SPILL_AV384_RESTORE;
1905 case 64:
1906 return AMDGPU::SI_SPILL_AV512_RESTORE;
1907 case 128:
1908 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1909 default:
1910 llvm_unreachable("unknown register size");
1911 }
1912}
1913
1914static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1915 bool IsVectorSuperClass) {
1916 // Currently, only 32-bit WWM register spills are needed.
1917 if (Size != 4)
1918 llvm_unreachable("unknown wwm register spill size");
1919
1920 if (IsVectorSuperClass)
1921 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1922
1923 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1924}
1925
1926static unsigned
1927getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1928 unsigned Size, const SIRegisterInfo &TRI,
1929 const SIMachineFunctionInfo &MFI) {
1930 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1931
1932 // Choose the right opcode if restoring a WWM register.
1933 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1934 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1935
1936 if (IsVectorSuperClass)
1937 return getAVSpillRestoreOpcode(Size);
1938
1939 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1940 : getVGPRSpillRestoreOpcode(Size);
1941}
1942
1943void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1944 MachineBasicBlock::iterator MI,
1945 Register DestReg, int FrameIndex,
1946 const TargetRegisterClass *RC,
1947 const TargetRegisterInfo *TRI,
1948 Register VReg) const {
1949 MachineFunction *MF = MBB.getParent();
1950 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1951 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1952 const DebugLoc &DL = MBB.findDebugLoc(MI);
1953 unsigned SpillSize = TRI->getSpillSize(*RC);
1954
1955 MachinePointerInfo PtrInfo
1956 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1957
1958 MachineMemOperand *MMO = MF->getMachineMemOperand(
1959 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1960 FrameInfo.getObjectAlign(FrameIndex));
1961
1962 if (RI.isSGPRClass(RC)) {
1963 MFI->setHasSpilledSGPRs();
1964 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1965 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1966 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1967
1968 // FIXME: Maybe this should not include a memoperand because it will be
1969 // lowered to non-memory instructions.
1970 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1971 if (DestReg.isVirtual() && SpillSize == 4) {
1972 MachineRegisterInfo &MRI = MF->getRegInfo();
1973 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1974 }
1975
1976 if (RI.spillSGPRToVGPR())
1977 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1978 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1979 .addFrameIndex(FrameIndex) // addr
1980 .addMemOperand(MMO)
1981 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1982
1983 return;
1984 }
1985
1986 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1987 SpillSize, RI, *MFI);
1988 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1989 .addFrameIndex(FrameIndex) // vaddr
1990 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1991 .addImm(0) // offset
1992 .addMemOperand(MMO);
1993}
1994
1995void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1996 MachineBasicBlock::iterator MI) const {
1997 insertNoops(MBB, MI, 1);
1998}
1999
2000void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
2001 MachineBasicBlock::iterator MI,
2002 unsigned Quantity) const {
2003 DebugLoc DL = MBB.findDebugLoc(MI);
2004 while (Quantity > 0) {
2005 unsigned Arg = std::min(Quantity, 8u);
2006 Quantity -= Arg;
2007 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
2008 }
2009}
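// Editorial note (not part of the original source): S_NOP's immediate holds
// the wait-state count minus one, and this helper emits at most 8 wait states
// per S_NOP, hence the min(Quantity, 8) chunking and addImm(Arg - 1) above.
// For example, insertNoops(MBB, MI, 10) produces:
//   s_nop 7   ; 8 wait states
//   s_nop 1   ; 2 wait states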
2010
2011void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
2012 auto MF = MBB.getParent();
2013 auto Info = MF->getInfo<SIMachineFunctionInfo>();
2014
2015 assert(Info->isEntryFunction());
2016
2017 if (MBB.succ_empty()) {
2018 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2019 if (HasNoTerminator) {
2020 if (Info->returnsVoid()) {
2021 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2022 } else {
2023 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2024 }
2025 }
2026 }
2027}
2028
2029MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
2030 MachineBasicBlock &MBB,
2031 MachineInstr &MI,
2032 const DebugLoc &DL) const {
2033 MachineFunction *MF = MBB.getParent();
2034 constexpr unsigned DoorbellIDMask = 0x3ff;
2035 constexpr unsigned ECQueueWaveAbort = 0x400;
2036
2037 MachineBasicBlock *TrapBB = &MBB;
2038 MachineBasicBlock *ContBB = &MBB;
2039 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2040
2041 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2042 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2043 TrapBB = MF->CreateMachineBasicBlock();
2044 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2045 MF->push_back(TrapBB);
2046 MBB.addSuccessor(TrapBB);
2047 }
2048
2049 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
2050 // this will be a nop.
2051 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2052 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2053 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2054 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2055 DoorbellReg)
2056 .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2057 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2058 .addUse(AMDGPU::M0);
2059 Register DoorbellRegMasked =
2060 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2061 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2062 .addUse(DoorbellReg)
2063 .addImm(DoorbellIDMask);
2064 Register SetWaveAbortBit =
2065 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2066 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2067 .addUse(DoorbellRegMasked)
2068 .addImm(ECQueueWaveAbort);
2069 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2070 .addUse(SetWaveAbortBit);
2071 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2072 .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
2073 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2074 .addUse(AMDGPU::TTMP2);
2075 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2076 TrapBB->addSuccessor(HaltLoopBB);
2077
2078 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2079 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2080 .addMBB(HaltLoopBB);
2081 MF->push_back(HaltLoopBB);
2082 HaltLoopBB->addSuccessor(HaltLoopBB);
2083
2084 return ContBB;
2085}
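// Editorial sketch of the control flow built above (block names are the local
// variables, not real labels; not part of the original source):
//   MBB:        ... s_cbranch_execnz TrapBB   ; only if MBB had to be split
//   TrapBB:     s_trap 2; fetch the doorbell ID; set the queue-wave-abort bit;
//               s_sendmsg; s_branch HaltLoopBB
//   HaltLoopBB: s_sethalt 5; s_branch HaltLoopBB   ; waves park here forever
//   ContBB:     the code that followed MI; returned to the caller.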
2086
2087unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2088 switch (MI.getOpcode()) {
2089 default:
2090 if (MI.isMetaInstruction())
2091 return 0;
2092 return 1; // FIXME: Do wait states equal cycles?
2093
2094 case AMDGPU::S_NOP:
2095 return MI.getOperand(0).getImm() + 1;
2096 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2097 // hazard, even if one exists, won't really be visible. Should we handle it?
2098 }
2099}
2100
2101bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2102 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2103 MachineBasicBlock &MBB = *MI.getParent();
2104 DebugLoc DL = MBB.findDebugLoc(MI);
2105 switch (MI.getOpcode()) {
2106 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2107 case AMDGPU::S_MOV_B64_term:
2108 // This is only a terminator to get the correct spill code placement during
2109 // register allocation.
2110 MI.setDesc(get(AMDGPU::S_MOV_B64));
2111 break;
2112
2113 case AMDGPU::S_MOV_B32_term:
2114 // This is only a terminator to get the correct spill code placement during
2115 // register allocation.
2116 MI.setDesc(get(AMDGPU::S_MOV_B32));
2117 break;
2118
2119 case AMDGPU::S_XOR_B64_term:
2120 // This is only a terminator to get the correct spill code placement during
2121 // register allocation.
2122 MI.setDesc(get(AMDGPU::S_XOR_B64));
2123 break;
2124
2125 case AMDGPU::S_XOR_B32_term:
2126 // This is only a terminator to get the correct spill code placement during
2127 // register allocation.
2128 MI.setDesc(get(AMDGPU::S_XOR_B32));
2129 break;
2130 case AMDGPU::S_OR_B64_term:
2131 // This is only a terminator to get the correct spill code placement during
2132 // register allocation.
2133 MI.setDesc(get(AMDGPU::S_OR_B64));
2134 break;
2135 case AMDGPU::S_OR_B32_term:
2136 // This is only a terminator to get the correct spill code placement during
2137 // register allocation.
2138 MI.setDesc(get(AMDGPU::S_OR_B32));
2139 break;
2140
2141 case AMDGPU::S_ANDN2_B64_term:
2142 // This is only a terminator to get the correct spill code placement during
2143 // register allocation.
2144 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2145 break;
2146
2147 case AMDGPU::S_ANDN2_B32_term:
2148 // This is only a terminator to get the correct spill code placement during
2149 // register allocation.
2150 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2151 break;
2152
2153 case AMDGPU::S_AND_B64_term:
2154 // This is only a terminator to get the correct spill code placement during
2155 // register allocation.
2156 MI.setDesc(get(AMDGPU::S_AND_B64));
2157 break;
2158
2159 case AMDGPU::S_AND_B32_term:
2160 // This is only a terminator to get the correct spill code placement during
2161 // register allocation.
2162 MI.setDesc(get(AMDGPU::S_AND_B32));
2163 break;
2164
2165 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2166 // This is only a terminator to get the correct spill code placement during
2167 // register allocation.
2168 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2169 break;
2170
2171 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2172 // This is only a terminator to get the correct spill code placement during
2173 // register allocation.
2174 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2175 break;
2176
2177 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2178 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2179 break;
2180
2181 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2182 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2183 break;
2184
2185 case AMDGPU::V_MOV_B64_PSEUDO: {
2186 Register Dst = MI.getOperand(0).getReg();
2187 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2188 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2189
2190 const MachineOperand &SrcOp = MI.getOperand(1);
2191 // FIXME: Will this work for 64-bit floating point immediates?
2192 assert(!SrcOp.isFPImm());
2193 if (ST.hasMovB64()) {
2194 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2195 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2196 isUInt<32>(SrcOp.getImm()))
2197 break;
2198 }
2199 if (SrcOp.isImm()) {
2200 APInt Imm(64, SrcOp.getImm());
2201 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2202 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2203 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2204 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2206 .addImm(Lo.getSExtValue())
2208 .addImm(Lo.getSExtValue())
2209 .addImm(0) // op_sel_lo
2210 .addImm(0) // op_sel_hi
2211 .addImm(0) // neg_lo
2212 .addImm(0) // neg_hi
2213 .addImm(0); // clamp
2214 } else {
2215 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2216 .addImm(Lo.getSExtValue())
2218 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2219 .addImm(Hi.getSExtValue())
2221 }
2222 } else {
2223 assert(SrcOp.isReg());
2224 if (ST.hasPkMovB32() &&
2225 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2226 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2227 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2228 .addReg(SrcOp.getReg())
2230 .addReg(SrcOp.getReg())
2231 .addImm(0) // op_sel_lo
2232 .addImm(0) // op_sel_hi
2233 .addImm(0) // neg_lo
2234 .addImm(0) // neg_hi
2235 .addImm(0); // clamp
2236 } else {
2237 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2238 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2240 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2241 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2243 }
2244 }
2245 MI.eraseFromParent();
2246 break;
2247 }
2248 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2249 expandMovDPP64(MI);
2250 break;
2251 }
2252 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2253 const MachineOperand &SrcOp = MI.getOperand(1);
2254 assert(!SrcOp.isFPImm());
2255 APInt Imm(64, SrcOp.getImm());
2256 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2257 MI.setDesc(get(AMDGPU::S_MOV_B64));
2258 break;
2259 }
2260
2261 Register Dst = MI.getOperand(0).getReg();
2262 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2263 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2264
2265 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2266 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2267 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2268 .addImm(Lo.getSExtValue())
2270 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2271 .addImm(Hi.getSExtValue())
2273 MI.eraseFromParent();
2274 break;
2275 }
2276 case AMDGPU::V_SET_INACTIVE_B32: {
2277 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2278 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2279 // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
2280 // optimizations (mainly Register Coalescer) aware of WWM register liveness.
2281 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2282 .add(MI.getOperand(1));
2283 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2284 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2285 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2286 .add(MI.getOperand(2));
2287 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2288 .addReg(Exec);
2289 MI.eraseFromParent();
2290 break;
2291 }
2292 case AMDGPU::V_SET_INACTIVE_B64: {
2293 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2294 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2295 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2296 MI.getOperand(0).getReg())
2297 .add(MI.getOperand(1));
2298 expandPostRAPseudo(*Copy);
2299 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2300 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2301 Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2302 MI.getOperand(0).getReg())
2303 .add(MI.getOperand(2));
2304 expandPostRAPseudo(*Copy);
2305 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2306 .addReg(Exec);
2307 MI.eraseFromParent();
2308 break;
2309 }
2310 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2311 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2312 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2313 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2314 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2315 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2316 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2317 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2318 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2319 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2320 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2321 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2322 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2323 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2324 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2325 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2326 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2327 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2328 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2329 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2330 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2331 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2332 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2333 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2334 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2335 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2336 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2337 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2338 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2339 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2340
2341 unsigned Opc;
2342 if (RI.hasVGPRs(EltRC)) {
2343 Opc = AMDGPU::V_MOVRELD_B32_e32;
2344 } else {
2345 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2346 : AMDGPU::S_MOVRELD_B32;
2347 }
2348
2349 const MCInstrDesc &OpDesc = get(Opc);
2350 Register VecReg = MI.getOperand(0).getReg();
2351 bool IsUndef = MI.getOperand(1).isUndef();
2352 unsigned SubReg = MI.getOperand(3).getImm();
2353 assert(VecReg == MI.getOperand(1).getReg());
2354
2356 BuildMI(MBB, MI, DL, OpDesc)
2357 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2358 .add(MI.getOperand(2))
2360 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2361
2362 const int ImpDefIdx =
2363 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2364 const int ImpUseIdx = ImpDefIdx + 1;
2365 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2366 MI.eraseFromParent();
2367 break;
2368 }
2369 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2370 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2371 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2372 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2373 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2374 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2375 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2376 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2377 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2378 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2379 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2380 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2382 Register VecReg = MI.getOperand(0).getReg();
2383 bool IsUndef = MI.getOperand(1).isUndef();
2384 Register Idx = MI.getOperand(3).getReg();
2385 Register SubReg = MI.getOperand(4).getImm();
2386
2387 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2388 .addReg(Idx)
2390 SetOn->getOperand(3).setIsUndef();
2391
2392 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2394 BuildMI(MBB, MI, DL, OpDesc)
2395 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2396 .add(MI.getOperand(2))
2398 .addReg(VecReg,
2399 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2400
2401 const int ImpDefIdx =
2402 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2403 const int ImpUseIdx = ImpDefIdx + 1;
2404 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2405
2406 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2407
2408 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2409
2410 MI.eraseFromParent();
2411 break;
2412 }
2413 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2414 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2415 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2416 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2417 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2418 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2419 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2420 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2421 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2422 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2423 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2424 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2426 Register Dst = MI.getOperand(0).getReg();
2427 Register VecReg = MI.getOperand(1).getReg();
2428 bool IsUndef = MI.getOperand(1).isUndef();
2429 Register Idx = MI.getOperand(2).getReg();
2430 Register SubReg = MI.getOperand(3).getImm();
2431
2432 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2433 .addReg(Idx)
2435 SetOn->getOperand(3).setIsUndef();
2436
2437 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2438 .addDef(Dst)
2439 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2440 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2441
2442 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2443
2444 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2445
2446 MI.eraseFromParent();
2447 break;
2448 }
2449 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2450 MachineFunction &MF = *MBB.getParent();
2451 Register Reg = MI.getOperand(0).getReg();
2452 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2453 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2454 MachineOperand OpLo = MI.getOperand(1);
2455 MachineOperand OpHi = MI.getOperand(2);
2456
2457 // Create a bundle so these instructions won't be re-ordered by the
2458 // post-RA scheduler.
2459 MIBundleBuilder Bundler(MBB, MI);
2460 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2461
2462 // What we want here is an offset from the value returned by s_getpc (which
2463 // is the address of the s_add_u32 instruction) to the global variable, but
2464 // since the encoding of $symbol starts 4 bytes after the start of the
2465 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2466 // small. This requires us to add 4 to the global variable offset in order
2467 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2468 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2469 // instruction.
2470
2471 int64_t Adjust = 0;
2472 if (ST.hasGetPCZeroExtension()) {
2473 // Fix up hardware that does not sign-extend the 48-bit PC value by
2474 // inserting: s_sext_i32_i16 reghi, reghi
2475 Bundler.append(
2476 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2477 Adjust += 4;
2478 }
2479
2480 if (OpLo.isGlobal())
2481 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2482 Bundler.append(
2483 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2484
2485 if (OpHi.isGlobal())
2486 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2487 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2488 .addReg(RegHi)
2489 .add(OpHi));
2490
2491 finalizeBundle(MBB, Bundler.begin());
2492
2493 MI.eraseFromParent();
2494 break;
2495 }
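// Editorial worked example for the +4/+12 adjustments above (not part of the
// original source). s_getpc_b64 yields the address of the instruction that
// follows it, so without the fixup the bundle is laid out as:
//   s_getpc_b64  s[0:1]          ; PC = address of s_add_u32
//   s_add_u32    s0, s0, sym+4   ; its literal sits 4 bytes past that PC
//   s_addc_u32   s1, s1, sym+12  ; its literal sits 12 bytes past that PC
// When the s_sext_i32_i16 fixup is appended first, both literals move another
// 4 bytes away from the PC value, which is what Adjust accounts for.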
2496 case AMDGPU::ENTER_STRICT_WWM: {
2497 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2498 // Whole Wave Mode is entered.
2499 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2500 : AMDGPU::S_OR_SAVEEXEC_B64));
2501 break;
2502 }
2503 case AMDGPU::ENTER_STRICT_WQM: {
2504 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2505 // STRICT_WQM is entered.
2506 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2507 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2508 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2509 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2510 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2511
2512 MI.eraseFromParent();
2513 break;
2514 }
2515 case AMDGPU::EXIT_STRICT_WWM:
2516 case AMDGPU::EXIT_STRICT_WQM: {
2517 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2518 // WWM/STRICT_WQM is exited.
2519 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2520 break;
2521 }
2522 case AMDGPU::SI_RETURN: {
2523 const MachineFunction *MF = MBB.getParent();
2524 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2525 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2526 // Hiding the return address use with SI_RETURN may lead to extra kills in
2527 // the function and missing live-ins. We are fine in practice because callee
2528 // saved register handling ensures the register value is restored before
2529 // RET, but we need the undef flag here to appease the MachineVerifier
2530 // liveness checks.
2532 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2533 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2534
2535 MIB.copyImplicitOps(MI);
2536 MI.eraseFromParent();
2537 break;
2538 }
2539
2540 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2541 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2542 MI.setDesc(get(AMDGPU::S_MUL_U64));
2543 break;
2544
2545 case AMDGPU::S_GETPC_B64_pseudo:
2546 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2547 if (ST.hasGetPCZeroExtension()) {
2548 Register Dst = MI.getOperand(0).getReg();
2549 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2550 // Fix up hardware that does not sign-extend the 48-bit PC value by
2551 // inserting: s_sext_i32_i16 dsthi, dsthi
2552 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2553 DstHi)
2554 .addReg(DstHi);
2555 }
2556 break;
2557 }
2558 return true;
2559}
2560
2563 unsigned SubIdx, const MachineInstr &Orig,
2564 const TargetRegisterInfo &RI) const {
2565
2566 // Try shrinking the instruction to remat only the part needed for the
2567 // current context.
2568 // TODO: Handle more cases.
2569 unsigned Opcode = Orig.getOpcode();
2570 switch (Opcode) {
2571 case AMDGPU::S_LOAD_DWORDX16_IMM:
2572 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2573 if (SubIdx != 0)
2574 break;
2575
2576 if (I == MBB.end())
2577 break;
2578
2579 if (I->isBundled())
2580 break;
2581
2582 // Look for a single use of the register that is also a subreg.
2583 Register RegToFind = Orig.getOperand(0).getReg();
2584 MachineOperand *UseMO = nullptr;
2585 for (auto &CandMO : I->operands()) {
2586 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2587 continue;
2588 if (UseMO) {
2589 UseMO = nullptr;
2590 break;
2591 }
2592 UseMO = &CandMO;
2593 }
2594 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2595 break;
2596
2597 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2598 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2599
2602 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2603
2604 unsigned NewOpcode = -1;
2605 if (SubregSize == 256)
2606 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2607 else if (SubregSize == 128)
2608 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2609 else
2610 break;
2611
2612 const MCInstrDesc &TID = get(NewOpcode);
2613 const TargetRegisterClass *NewRC =
2614 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2615 MRI.setRegClass(DestReg, NewRC);
2616
2617 UseMO->setReg(DestReg);
2618 UseMO->setSubReg(AMDGPU::NoSubRegister);
2619
2620 // Use a smaller load with the desired size, possibly with updated offset.
2621 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2622 MI->setDesc(TID);
2623 MI->getOperand(0).setReg(DestReg);
2624 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2625 if (Offset) {
2626 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2627 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2628 OffsetMO->setImm(FinalOffset);
2629 }
2631 for (const MachineMemOperand *MemOp : Orig.memoperands())
2632 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2633 SubregSize / 8));
2634 MI->setMemRefs(*MF, NewMMOs);
2635
2636 MBB.insert(I, MI);
2637 return;
2638 }
2639
2640 default:
2641 break;
2642 }
2643
2644 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2645}
2646
2647std::pair<MachineInstr*, MachineInstr*>
2648SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2649 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2650
2651 if (ST.hasMovB64() &&
2653 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2654 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2655 return std::pair(&MI, nullptr);
2656 }
2657
2658 MachineBasicBlock &MBB = *MI.getParent();
2662 Register Dst = MI.getOperand(0).getReg();
2663 unsigned Part = 0;
2664 MachineInstr *Split[2];
2665
2666 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2667 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2668 if (Dst.isPhysical()) {
2669 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2670 } else {
2671 assert(MRI.isSSA());
2672 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2673 MovDPP.addDef(Tmp);
2674 }
2675
2676 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2677 const MachineOperand &SrcOp = MI.getOperand(I);
2678 assert(!SrcOp.isFPImm());
2679 if (SrcOp.isImm()) {
2680 APInt Imm(64, SrcOp.getImm());
2681 Imm.ashrInPlace(Part * 32);
2682 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2683 } else {
2684 assert(SrcOp.isReg());
2685 Register Src = SrcOp.getReg();
2686 if (Src.isPhysical())
2687 MovDPP.addReg(RI.getSubReg(Src, Sub));
2688 else
2689 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2690 }
2691 }
2692
2693 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2694 MovDPP.addImm(MO.getImm());
2695
2696 Split[Part] = MovDPP;
2697 ++Part;
2698 }
2699
2700 if (Dst.isVirtual())
2701 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2702 .addReg(Split[0]->getOperand(0).getReg())
2703 .addImm(AMDGPU::sub0)
2704 .addReg(Split[1]->getOperand(0).getReg())
2705 .addImm(AMDGPU::sub1);
2706
2707 MI.eraseFromParent();
2708 return std::pair(Split[0], Split[1]);
2709}
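// Editorial sketch (not part of the original source): when the subtarget lacks
// a native 64-bit DPP mov, the pseudo above is split into per-half moves that
// reuse the same DPP control operands, then reassembled for virtual defs:
//   %lo:vgpr_32 = V_MOV_B32_dpp %old.sub0, %src.sub0, <dpp controls...>
//   %hi:vgpr_32 = V_MOV_B32_dpp %old.sub1, %src.sub1, <dpp controls...>
//   %dst:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// An immediate source is split the same way via the ashrInPlace(Part * 32)
// above, taking the low 32 bits of each shifted value.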
2710
2711std::optional<DestSourcePair>
2713 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2714 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2715
2716 return std::nullopt;
2717}
2718
2720 MachineOperand &Src0,
2721 unsigned Src0OpName,
2722 MachineOperand &Src1,
2723 unsigned Src1OpName) const {
2724 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2725 if (!Src0Mods)
2726 return false;
2727
2728 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2729 assert(Src1Mods &&
2730 "All commutable instructions have both src0 and src1 modifiers");
2731
2732 int Src0ModsVal = Src0Mods->getImm();
2733 int Src1ModsVal = Src1Mods->getImm();
2734
2735 Src1Mods->setImm(Src0ModsVal);
2736 Src0Mods->setImm(Src1ModsVal);
2737 return true;
2738}
2739
2741 MachineOperand &RegOp,
2742 MachineOperand &NonRegOp) {
2743 Register Reg = RegOp.getReg();
2744 unsigned SubReg = RegOp.getSubReg();
2745 bool IsKill = RegOp.isKill();
2746 bool IsDead = RegOp.isDead();
2747 bool IsUndef = RegOp.isUndef();
2748 bool IsDebug = RegOp.isDebug();
2749
2750 if (NonRegOp.isImm())
2751 RegOp.ChangeToImmediate(NonRegOp.getImm());
2752 else if (NonRegOp.isFI())
2753 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2754 else if (NonRegOp.isGlobal()) {
2755 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2756 NonRegOp.getTargetFlags());
2757 } else
2758 return nullptr;
2759
2760 // Make sure we don't reinterpret a subreg index in the target flags.
2761 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2762
2763 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2764 NonRegOp.setSubReg(SubReg);
2765
2766 return &MI;
2767}
2768
2770 unsigned Src0Idx,
2771 unsigned Src1Idx) const {
2772 assert(!NewMI && "this should never be used");
2773
2774 unsigned Opc = MI.getOpcode();
2775 int CommutedOpcode = commuteOpcode(Opc);
2776 if (CommutedOpcode == -1)
2777 return nullptr;
2778
2779 if (Src0Idx > Src1Idx)
2780 std::swap(Src0Idx, Src1Idx);
2781
2782 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2783 static_cast<int>(Src0Idx) &&
2784 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2785 static_cast<int>(Src1Idx) &&
2786 "inconsistency with findCommutedOpIndices");
2787
2788 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2789 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2790
2791 MachineInstr *CommutedMI = nullptr;
2792 if (Src0.isReg() && Src1.isReg()) {
2793 if (isOperandLegal(MI, Src1Idx, &Src0)) {
2794 // Be sure to copy the source modifiers to the right place.
2795 CommutedMI
2796 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2797 }
2798
2799 } else if (Src0.isReg() && !Src1.isReg()) {
2800 // src0 should always be able to support any operand type, so no need to
2801 // check operand legality.
2802 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2803 } else if (!Src0.isReg() && Src1.isReg()) {
2804 if (isOperandLegal(MI, Src1Idx, &Src0))
2805 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2806 } else {
2807 // FIXME: Found two non-register operands to commute. This does happen.
2808 return nullptr;
2809 }
2810
2811 if (CommutedMI) {
2812 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2813 Src1, AMDGPU::OpName::src1_modifiers);
2814
2815 CommutedMI->setDesc(get(CommutedOpcode));
2816 }
2817
2818 return CommutedMI;
2819}
2820
2821// This needs to be implemented because the source modifiers may be inserted
2822// between the true commutable operands, and the base
2823// TargetInstrInfo::commuteInstruction uses it.
2825 unsigned &SrcOpIdx0,
2826 unsigned &SrcOpIdx1) const {
2827 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2828}
2829
2831 unsigned &SrcOpIdx0,
2832 unsigned &SrcOpIdx1) const {
2833 if (!Desc.isCommutable())
2834 return false;
2835
2836 unsigned Opc = Desc.getOpcode();
2837 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2838 if (Src0Idx == -1)
2839 return false;
2840
2841 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2842 if (Src1Idx == -1)
2843 return false;
2844
2845 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2846}
2847
2849 int64_t BrOffset) const {
2850 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2851 // block is unanalyzable.
2852 assert(BranchOp != AMDGPU::S_SETPC_B64);
2853
2854 // Convert to dwords.
2855 BrOffset /= 4;
2856
2857 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2858 // from the next instruction.
2859 BrOffset -= 1;
2860
2861 return isIntN(BranchOffsetBits, BrOffset);
2862}
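// Editorial worked example (not part of the original source): with the default
// amdgpu-s-branch-bits=16, the dword offset (BrOffset / 4 - 1) must fit in a
// signed 16-bit immediate, so destinations roughly within +/-128 KiB of the
// branch are reachable directly; anything farther is relaxed by
// BranchRelaxation into the getpc/add/setpc sequence built below.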
2863
2866 return MI.getOperand(0).getMBB();
2867}
2868
2870 for (const MachineInstr &MI : MBB->terminators()) {
2871 if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
2872 MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2873 MI.getOpcode() == AMDGPU::SI_LOOP)
2874 return true;
2875 }
2876 return false;
2877}
2878
2880 MachineBasicBlock &DestBB,
2881 MachineBasicBlock &RestoreBB,
2882 const DebugLoc &DL, int64_t BrOffset,
2883 RegScavenger *RS) const {
2884 assert(RS && "RegScavenger required for long branching");
2885 assert(MBB.empty() &&
2886 "new block should be inserted for expanding unconditional branch");
2887 assert(MBB.pred_size() == 1);
2888 assert(RestoreBB.empty() &&
2889 "restore block should be inserted for restoring clobbered registers");
2890
2891 MachineFunction *MF = MBB.getParent();
2892 MachineRegisterInfo &MRI = MF->getRegInfo();
2893 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2894
2895 // FIXME: Virtual register workaround for RegScavenger not working with empty
2896 // blocks.
2897 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2898
2899 auto I = MBB.end();
2900
2901 // We need to compute the offset relative to the instruction immediately after
2902 // s_getpc_b64. Insert the PC arithmetic code before the last terminator.
2903 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2904
2905 auto &MCCtx = MF->getContext();
2906 MCSymbol *PostGetPCLabel =
2907 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2908 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2909
2910 MCSymbol *OffsetLo =
2911 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2912 MCSymbol *OffsetHi =
2913 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2914 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2915 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2916 .addReg(PCReg, 0, AMDGPU::sub0)
2917 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2918 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2919 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2920 .addReg(PCReg, 0, AMDGPU::sub1)
2921 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2922
2923 // Insert the indirect branch after the other terminator.
2924 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2925 .addReg(PCReg);
2926
2927 // If a spill is needed for the pc register pair, we need to insert a spill
2928 // restore block right before the destination block, and insert a short branch
2929 // into the old destination block's fallthrough predecessor.
2930 // e.g.:
2931 //
2932 // s_cbranch_scc0 skip_long_branch:
2933 //
2934 // long_branch_bb:
2935 // spill s[8:9]
2936 // s_getpc_b64 s[8:9]
2937 // s_add_u32 s8, s8, restore_bb
2938 // s_addc_u32 s9, s9, 0
2939 // s_setpc_b64 s[8:9]
2940 //
2941 // skip_long_branch:
2942 // foo;
2943 //
2944 // .....
2945 //
2946 // dest_bb_fallthrough_predecessor:
2947 // bar;
2948 // s_branch dest_bb
2949 //
2950 // restore_bb:
2951 // restore s[8:9]
2952 // fallthrough dest_bb
2953 //
2954 // dest_bb:
2955 // buzz;
2956
2957 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2958 Register Scav;
2959
2960 // If we've previously reserved a register for long branches,
2961 // avoid running the scavenger and just use that register.
2962 if (LongBranchReservedReg) {
2963 RS->enterBasicBlock(MBB);
2964 Scav = LongBranchReservedReg;
2965 } else {
2966 RS->enterBasicBlockEnd(MBB);
2967 Scav = RS->scavengeRegisterBackwards(
2968 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2969 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2970 }
2971 if (Scav) {
2972 RS->setRegUsed(Scav);
2973 MRI.replaceRegWith(PCReg, Scav);
2974 MRI.clearVirtRegs();
2975 } else {
2976 // As spilling an SGPR needs a VGPR, we reuse the temporary VGPR's slot
2977 // for the SGPR spill.
2978 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2979 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2980 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
2981 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
2982 MRI.clearVirtRegs();
2983 }
2984
2985 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
2986 // Now the distance can be computed.
2987 auto *Offset = MCBinaryExpr::createSub(
2988 MCSymbolRefExpr::create(DestLabel, MCCtx),
2989 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
2990 // Add offset assignments.
2991 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
2992 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
2993 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
2994 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
2995}
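// Editorial note (not part of the original source): the two temporary symbols
// above resolve at MC layout time to the 64-bit distance
//   Offset = DestLabel - PostGetPCLabel
// split as offset_lo = Offset & 0xffffffff and offset_hi = Offset ashr 32, so
// the s_add_u32 / s_addc_u32 pair reconstructs the full PC-relative address of
// a destination that lies outside the direct branch range.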
2996
2997unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2998 switch (Cond) {
2999 case SIInstrInfo::SCC_TRUE:
3000 return AMDGPU::S_CBRANCH_SCC1;
3001 case SIInstrInfo::SCC_FALSE:
3002 return AMDGPU::S_CBRANCH_SCC0;
3003 case SIInstrInfo::VCCNZ:
3004 return AMDGPU::S_CBRANCH_VCCNZ;
3005 case SIInstrInfo::VCCZ:
3006 return AMDGPU::S_CBRANCH_VCCZ;
3007 case SIInstrInfo::EXECNZ:
3008 return AMDGPU::S_CBRANCH_EXECNZ;
3009 case SIInstrInfo::EXECZ:
3010 return AMDGPU::S_CBRANCH_EXECZ;
3011 default:
3012 llvm_unreachable("invalid branch predicate");
3013 }
3014}
3015
3016SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3017 switch (Opcode) {
3018 case AMDGPU::S_CBRANCH_SCC0:
3019 return SCC_FALSE;
3020 case AMDGPU::S_CBRANCH_SCC1:
3021 return SCC_TRUE;
3022 case AMDGPU::S_CBRANCH_VCCNZ:
3023 return VCCNZ;
3024 case AMDGPU::S_CBRANCH_VCCZ:
3025 return VCCZ;
3026 case AMDGPU::S_CBRANCH_EXECNZ:
3027 return EXECNZ;
3028 case AMDGPU::S_CBRANCH_EXECZ:
3029 return EXECZ;
3030 default:
3031 return INVALID_BR;
3032 }
3033}
3034
3035bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3036 MachineBasicBlock::iterator I,
3037 MachineBasicBlock *&TBB,
3038 MachineBasicBlock *&FBB,
3039 SmallVectorImpl<MachineOperand> &Cond,
3040 bool AllowModify) const {
3041 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3042 // Unconditional Branch
3043 TBB = I->getOperand(0).getMBB();
3044 return false;
3045 }
3046
3047 MachineBasicBlock *CondBB = nullptr;
3048
3049 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
3050 CondBB = I->getOperand(1).getMBB();
3051 Cond.push_back(I->getOperand(0));
3052 } else {
3053 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3054 if (Pred == INVALID_BR)
3055 return true;
3056
3057 CondBB = I->getOperand(0).getMBB();
3058 Cond.push_back(MachineOperand::CreateImm(Pred));
3059 Cond.push_back(I->getOperand(1)); // Save the branch register.
3060 }
3061 ++I;
3062
3063 if (I == MBB.end()) {
3064 // Conditional branch followed by fall-through.
3065 TBB = CondBB;
3066 return false;
3067 }
3068
3069 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3070 TBB = CondBB;
3071 FBB = I->getOperand(0).getMBB();
3072 return false;
3073 }
3074
3075 return true;
3076}
3077
3078bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3079 MachineBasicBlock *&FBB,
3080 SmallVectorImpl<MachineOperand> &Cond,
3081 bool AllowModify) const {
3082 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3083 auto E = MBB.end();
3084 if (I == E)
3085 return false;
3086
3087 // Skip over the instructions that are artificial terminators for special
3088 // exec management.
3089 while (I != E && !I->isBranch() && !I->isReturn()) {
3090 switch (I->getOpcode()) {
3091 case AMDGPU::S_MOV_B64_term:
3092 case AMDGPU::S_XOR_B64_term:
3093 case AMDGPU::S_OR_B64_term:
3094 case AMDGPU::S_ANDN2_B64_term:
3095 case AMDGPU::S_AND_B64_term:
3096 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3097 case AMDGPU::S_MOV_B32_term:
3098 case AMDGPU::S_XOR_B32_term:
3099 case AMDGPU::S_OR_B32_term:
3100 case AMDGPU::S_ANDN2_B32_term:
3101 case AMDGPU::S_AND_B32_term:
3102 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3103 break;
3104 case AMDGPU::SI_IF:
3105 case AMDGPU::SI_ELSE:
3106 case AMDGPU::SI_KILL_I1_TERMINATOR:
3107 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3108 // FIXME: It's messy that these need to be considered here at all.
3109 return true;
3110 default:
3111 llvm_unreachable("unexpected non-branch terminator inst");
3112 }
3113
3114 ++I;
3115 }
3116
3117 if (I == E)
3118 return false;
3119
3120 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3121}
3122
3123unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3124 int *BytesRemoved) const {
3125 unsigned Count = 0;
3126 unsigned RemovedSize = 0;
3127 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3128 // Skip over artificial terminators when removing instructions.
3129 if (MI.isBranch() || MI.isReturn()) {
3130 RemovedSize += getInstSizeInBytes(MI);
3131 MI.eraseFromParent();
3132 ++Count;
3133 }
3134 }
3135
3136 if (BytesRemoved)
3137 *BytesRemoved = RemovedSize;
3138
3139 return Count;
3140}
3141
3142// Copy the flags onto the implicit condition register operand.
3144 const MachineOperand &OrigCond) {
3145 CondReg.setIsUndef(OrigCond.isUndef());
3146 CondReg.setIsKill(OrigCond.isKill());
3147}
3148
3151 MachineBasicBlock *FBB,
3153 const DebugLoc &DL,
3154 int *BytesAdded) const {
3155 if (!FBB && Cond.empty()) {
3156 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3157 .addMBB(TBB);
3158 if (BytesAdded)
3159 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3160 return 1;
3161 }
3162
3163 if (Cond.size() == 1 && Cond[0].isReg()) {
3164 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
3165 .add(Cond[0])
3166 .addMBB(TBB);
3167 return 1;
3168 }
3169
3170 assert(TBB && Cond[0].isImm());
3171
3172 unsigned Opcode
3173 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3174
3175 if (!FBB) {
3176 MachineInstr *CondBr =
3177 BuildMI(&MBB, DL, get(Opcode))
3178 .addMBB(TBB);
3179
3180 // Copy the flags onto the implicit condition register operand.
3181 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3182 fixImplicitOperands(*CondBr);
3183
3184 if (BytesAdded)
3185 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3186 return 1;
3187 }
3188
3189 assert(TBB && FBB);
3190
3191 MachineInstr *CondBr =
3192 BuildMI(&MBB, DL, get(Opcode))
3193 .addMBB(TBB);
3194 fixImplicitOperands(*CondBr);
3195 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3196 .addMBB(FBB);
3197
3198 MachineOperand &CondReg = CondBr->getOperand(1);
3199 CondReg.setIsUndef(Cond[1].isUndef());
3200 CondReg.setIsKill(Cond[1].isKill());
3201
3202 if (BytesAdded)
3203 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3204
3205 return 2;
3206}
3207
3210 if (Cond.size() != 2) {
3211 return true;
3212 }
3213
3214 if (Cond[0].isImm()) {
3215 Cond[0].setImm(-Cond[0].getImm());
3216 return false;
3217 }
3218
3219 return true;
3220}
3221
3224 Register DstReg, Register TrueReg,
3225 Register FalseReg, int &CondCycles,
3226 int &TrueCycles, int &FalseCycles) const {
3227 switch (Cond[0].getImm()) {
3228 case VCCNZ:
3229 case VCCZ: {
3231 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3232 if (MRI.getRegClass(FalseReg) != RC)
3233 return false;
3234
3235 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3236 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3237
3238 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3239 return RI.hasVGPRs(RC) && NumInsts <= 6;
3240 }
3241 case SCC_TRUE:
3242 case SCC_FALSE: {
3243 // FIXME: We could insert for VGPRs if we could replace the original compare
3244 // with a vector one.
3246 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3247 if (MRI.getRegClass(FalseReg) != RC)
3248 return false;
3249
3250 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3251
3252 // Widths that are a multiple of 8 bytes (64 bits) can use s_cselect_b64.
3253 if (NumInsts % 2 == 0)
3254 NumInsts /= 2;
3255
3256 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3257 return RI.isSGPRClass(RC);
3258 }
3259 default:
3260 return false;
3261 }
3262}
3263
3267 Register TrueReg, Register FalseReg) const {
3268 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3269 if (Pred == VCCZ || Pred == SCC_FALSE) {
3270 Pred = static_cast<BranchPredicate>(-Pred);
3271 std::swap(TrueReg, FalseReg);
3272 }
3273
3275 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3276 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3277
3278 if (DstSize == 32) {
3280 if (Pred == SCC_TRUE) {
3281 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3282 .addReg(TrueReg)
3283 .addReg(FalseReg);
3284 } else {
3285 // Instruction's operands are backwards from what is expected.
3286 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3287 .addReg(FalseReg)
3288 .addReg(TrueReg);
3289 }
3290
3291 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3292 return;
3293 }
3294
3295 if (DstSize == 64 && Pred == SCC_TRUE) {
3297 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3298 .addReg(TrueReg)
3299 .addReg(FalseReg);
3300
3301 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3302 return;
3303 }
3304
3305 static const int16_t Sub0_15[] = {
3306 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3307 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3308 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3309 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3310 };
3311
3312 static const int16_t Sub0_15_64[] = {
3313 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3314 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3315 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3316 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3317 };
3318
3319 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3320 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3321 const int16_t *SubIndices = Sub0_15;
3322 int NElts = DstSize / 32;
3323
3324 // 64-bit select is only available for SALU.
3325 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3326 if (Pred == SCC_TRUE) {
3327 if (NElts % 2) {
3328 SelOp = AMDGPU::S_CSELECT_B32;
3329 EltRC = &AMDGPU::SGPR_32RegClass;
3330 } else {
3331 SelOp = AMDGPU::S_CSELECT_B64;
3332 EltRC = &AMDGPU::SGPR_64RegClass;
3333 SubIndices = Sub0_15_64;
3334 NElts /= 2;
3335 }
3336 }
3337
3339 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3340
3341 I = MIB->getIterator();
3342
3344 for (int Idx = 0; Idx != NElts; ++Idx) {
3345 Register DstElt = MRI.createVirtualRegister(EltRC);
3346 Regs.push_back(DstElt);
3347
3348 unsigned SubIdx = SubIndices[Idx];
3349
3351 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3352 Select =
3353 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3354 .addReg(FalseReg, 0, SubIdx)
3355 .addReg(TrueReg, 0, SubIdx);
3356 } else {
3357 Select =
3358 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3359 .addReg(TrueReg, 0, SubIdx)
3360 .addReg(FalseReg, 0, SubIdx);
3361 }
3362
3363 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3365
3366 MIB.addReg(DstElt)
3367 .addImm(SubIdx);
3368 }
3369}
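// Editorial sketch (illustrative MIR, not part of the original source): for a
// 128-bit VGPR select on VCC the loop above emits one lane-select per 32-bit
// piece and stitches the results together, roughly
//   %e0 = V_CNDMASK_B32_e32 %false.sub0, %true.sub0, implicit $vcc
//   ...                                   ; sub1..sub3 likewise
//   %dst = REG_SEQUENCE %e0, %subreg.sub0, ..., %e3, %subreg.sub3
// while an SCC-based SGPR select of the same width instead uses two
// S_CSELECT_B64 copies over 64-bit sub-tuples.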
3370
3372 switch (MI.getOpcode()) {
3373 case AMDGPU::V_MOV_B32_e32:
3374 case AMDGPU::V_MOV_B32_e64:
3375 case AMDGPU::V_MOV_B64_PSEUDO:
3376 case AMDGPU::V_MOV_B64_e32:
3377 case AMDGPU::V_MOV_B64_e64:
3378 case AMDGPU::S_MOV_B32:
3379 case AMDGPU::S_MOV_B64:
3380 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3381 case AMDGPU::COPY:
3382 case AMDGPU::WWM_COPY:
3383 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3384 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3385 case AMDGPU::V_ACCVGPR_MOV_B32:
3386 return true;
3387 default:
3388 return false;
3389 }
3390}
3391
3392static constexpr unsigned ModifierOpNames[] = {
3393 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3394 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3395 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3396
3398 unsigned Opc = MI.getOpcode();
3399 for (unsigned Name : reverse(ModifierOpNames)) {
3401 if (Idx >= 0)
3402 MI.removeOperand(Idx);
3403 }
3404}
3405
3407 Register Reg, MachineRegisterInfo *MRI) const {
3408 if (!MRI->hasOneNonDBGUse(Reg))
3409 return false;
3410
3411 switch (DefMI.getOpcode()) {
3412 default:
3413 return false;
3414 case AMDGPU::V_MOV_B64_e32:
3415 case AMDGPU::S_MOV_B64:
3416 case AMDGPU::V_MOV_B64_PSEUDO:
3417 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3418 case AMDGPU::V_MOV_B32_e32:
3419 case AMDGPU::S_MOV_B32:
3420 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3421 break;
3422 }
3423
3424 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
3425 assert(ImmOp);
3426 // FIXME: We could handle FrameIndex values here.
3427 if (!ImmOp->isImm())
3428 return false;
3429
3430 auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
3431 int64_t Imm = ImmOp->getImm();
3432 switch (UseOp.getSubReg()) {
3433 default:
3434 return Imm;
3435 case AMDGPU::sub0:
3436 return Lo_32(Imm);
3437 case AMDGPU::sub1:
3438 return Hi_32(Imm);
3439 case AMDGPU::lo16:
3440 return APInt(16, Imm).getSExtValue();
3441 case AMDGPU::hi16:
3442 return APInt(32, Imm).ashr(16).getSExtValue();
3443 case AMDGPU::sub1_lo16:
3444 return APInt(16, Hi_32(Imm)).getSExtValue();
3445 case AMDGPU::sub1_hi16:
3446 return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue();
3447 }
3448 };
3449
3450 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3451
3452 unsigned Opc = UseMI.getOpcode();
3453 if (Opc == AMDGPU::COPY) {
3454 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3455
3456 Register DstReg = UseMI.getOperand(0).getReg();
3457 unsigned OpSize = getOpSize(UseMI, 0);
3458 bool Is16Bit = OpSize == 2;
3459 bool Is64Bit = OpSize == 8;
3460 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3461 unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3462 : AMDGPU::V_MOV_B32_e32
3463 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3464 : AMDGPU::S_MOV_B32;
3465 APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)));
3466
3467 if (RI.isAGPR(*MRI, DstReg)) {
3468 if (Is64Bit || !isInlineConstant(Imm))
3469 return false;
3470 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3471 }
3472
3473 if (Is16Bit) {
3474 if (isVGPRCopy)
3475 return false; // Do not clobber vgpr_hi16
3476
3477 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3478 return false;
3479
3480 UseMI.getOperand(0).setSubReg(0);
3481 if (DstReg.isPhysical()) {
3482 DstReg = RI.get32BitRegister(DstReg);
3483 UseMI.getOperand(0).setReg(DstReg);
3484 }
3485 assert(UseMI.getOperand(1).getReg().isVirtual());
3486 }
3487
3488 const MCInstrDesc &NewMCID = get(NewOpc);
3489 if (DstReg.isPhysical() &&
3490 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3491 return false;
3492
3493 UseMI.setDesc(NewMCID);
3494 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3495 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3496 return true;
3497 }
3498
3499 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3500 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3501 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3502 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3503 Opc == AMDGPU::V_FMAC_F16_t16_e64) {
3504 // Don't fold if we are using source or output modifiers. The new VOP2
3505 // instructions don't have them.
3507 return false;
3508
3509 // If this is a free constant, there's no reason to do this.
3510 // TODO: We could fold this here instead of letting SIFoldOperands do it
3511 // later.
3512 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
3513
3514 // Any src operand can be used for the legality check.
3515 if (isInlineConstant(UseMI, *Src0, *ImmOp))
3516 return false;
3517
3518 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3519 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
3520 bool IsFMA =
3521 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3522 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3523 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3524 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3525 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3526
3527 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3528 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3529 (Src1->isReg() && Src1->getReg() == Reg)) {
3530 MachineOperand *RegSrc =
3531 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3532 if (!RegSrc->isReg())
3533 return false;
3534 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3535 ST.getConstantBusLimit(Opc) < 2)
3536 return false;
3537
3538 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3539 return false;
3540
3541 // If src2 is also a literal constant then we have to choose which one to
3542 // fold. In general it is better to choose madak so that the other literal
3543 // can be materialized in an sgpr instead of a vgpr:
3544 // s_mov_b32 s0, literal
3545 // v_madak_f32 v0, s0, v0, literal
3546 // Instead of:
3547 // v_mov_b32 v1, literal
3548 // v_madmk_f32 v0, v0, literal, v1
3549 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3550 if (Def && Def->isMoveImmediate() &&
3551 !isInlineConstant(Def->getOperand(1)))
3552 return false;
3553
3554 unsigned NewOpc =
3555 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3556 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3557 : AMDGPU::V_FMAMK_F16)
3558 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3559 if (pseudoToMCOpcode(NewOpc) == -1)
3560 return false;
3561
3562 // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3563 // would also require restricting their register classes. For now
3564 // just bail out.
3565 if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
3566 return false;
3567
3568 const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
3569
3570 // FIXME: This would be a lot easier if we could return a new instruction
3571 // instead of having to modify in place.
3572
3573 Register SrcReg = RegSrc->getReg();
3574 unsigned SrcSubReg = RegSrc->getSubReg();
3575 Src0->setReg(SrcReg);
3576 Src0->setSubReg(SrcSubReg);
3577 Src0->setIsKill(RegSrc->isKill());
3578
3579 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3580 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3581 Opc == AMDGPU::V_FMAC_F16_e64)
3582 UseMI.untieRegOperand(
3583 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3584
3585 Src1->ChangeToImmediate(Imm);
3586
3587 removeModOperands(UseMI);
3588 UseMI.setDesc(get(NewOpc));
3589
3590 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3591 if (DeleteDef)
3592 DefMI.eraseFromParent();
3593
3594 return true;
3595 }
3596
3597 // Added part is the constant: Use v_madak_{f16, f32}.
3598 if (Src2->isReg() && Src2->getReg() == Reg) {
3599 if (ST.getConstantBusLimit(Opc) < 2) {
3600 // Not allowed to use constant bus for another operand.
3601 // We can however allow an inline immediate as src0.
3602 bool Src0Inlined = false;
3603 if (Src0->isReg()) {
3604 // Try to inline the constant if possible.
3605 // If the def is a move-immediate and this is its only use,
3606 // we save a VGPR here.
3607 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3608 if (Def && Def->isMoveImmediate() &&
3609 isInlineConstant(Def->getOperand(1)) &&
3610 MRI->hasOneUse(Src0->getReg())) {
3611 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3612 Src0Inlined = true;
3613 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3614 RI.isSGPRReg(*MRI, Src0->getReg())) {
3615 return false;
3616 }
3617 // VGPR is okay as Src0 - fallthrough
3618 }
3619
3620 if (Src1->isReg() && !Src0Inlined) {
3621 // We have one slot for inlinable constant so far - try to fill it
3622 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3623 if (Def && Def->isMoveImmediate() &&
3624 isInlineConstant(Def->getOperand(1)) &&
3625 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3626 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3627 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3628 return false;
3629 // VGPR is okay as Src1 - fallthrough
3630 }
3631 }
3632
3633 unsigned NewOpc =
3634 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3635 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3636 : AMDGPU::V_FMAAK_F16)
3637 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3638 if (pseudoToMCOpcode(NewOpc) == -1)
3639 return false;
3640
3641 // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3642 // would also require restricting their register classes. For now
3643 // just bail out.
3644 if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
3645 return false;
3646
3647 // FIXME: This would be a lot easier if we could return a new instruction
3648 // instead of having to modify in place.
3649
3650 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3651 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3652 Opc == AMDGPU::V_FMAC_F16_e64)
3653 UseMI.untieRegOperand(
3654 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3655
3656 // Changing Src2 to an immediate adds it back to the instruction.
3657 Src2->ChangeToImmediate(getImmFor(*Src2));
3658
3659 // These come before src2.
3660 removeModOperands(UseMI);
3661 UseMI.setDesc(get(NewOpc));
3662 // UseMI might have been commuted, leaving an SGPR as src1. In that
3663 // case an inline constant plus an SGPR would violate the constant bus
3664 // limit, so re-legalize the operands.
3665 legalizeOperands(UseMI);
3666
3667 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3668 if (DeleteDef)
3669 DefMI.eraseFromParent();
3670
3671 return true;
3672 }
3673 }
3674
3675 return false;
3676}
3677
3678static bool
3679memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3680 ArrayRef<const MachineOperand *> BaseOps2) {
3681 if (BaseOps1.size() != BaseOps2.size())
3682 return false;
3683 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3684 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3685 return false;
3686 }
3687 return true;
3688}
3689
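// Two accesses with the same base operands are disjoint when the
// lower-addressed range ends at or before the higher one begins, i.e. when
// LowOffset + LowWidth <= HighOffset.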
3690static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3691 LocationSize WidthB, int OffsetB) {
3692 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3693 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3694 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3695 return LowWidth.hasValue() &&
3696 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3697}
3698
3699bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3700 const MachineInstr &MIb) const {
3701 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3702 int64_t Offset0, Offset1;
3703 LocationSize Dummy0 = 0, Dummy1 = 0;
3704 bool Offset0IsScalable, Offset1IsScalable;
3705 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3706 Dummy0, &RI) ||
3707 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3708 Dummy1, &RI))
3709 return false;
3710
3711 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3712 return false;
3713
3714 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3715 // FIXME: Handle ds_read2 / ds_write2.
3716 return false;
3717 }
3718 LocationSize Width0 = MIa.memoperands().front()->getSize();
3719 LocationSize Width1 = MIb.memoperands().front()->getSize();
3720 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3721}
3722
3723bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3724 const MachineInstr &MIb) const {
3725 assert(MIa.mayLoadOrStore() &&
3726 "MIa must load from or modify a memory location");
3727 assert(MIb.mayLoadOrStore() &&
3728 "MIb must load from or modify a memory location");
3729
3730 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3731 return false;
3732
3733 // XXX - Can we relax this between address spaces?
3734 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3735 return false;
3736
3737 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3738 return false;
3739
3740 // TODO: Should we check the address space from the MachineMemOperand? That
3741 // would allow us to distinguish objects we know don't alias based on the
3742 // underlying address space, even if it was lowered to a different one,
3743 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3744 // buffer.
3745 if (isDS(MIa)) {
3746 if (isDS(MIb))
3747 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3748
3749 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3750 }
3751
3752 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3753 if (isMUBUF(MIb) || isMTBUF(MIb))
3754 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3755
3756 if (isFLAT(MIb))
3757 return isFLATScratch(MIb);
3758
3759 return !isSMRD(MIb);
3760 }
3761
3762 if (isSMRD(MIa)) {
3763 if (isSMRD(MIb))
3764 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3765
3766 if (isFLAT(MIb))
3767 return isFLATScratch(MIb);
3768
3769 return !isMUBUF(MIb) && !isMTBUF(MIb);
3770 }
3771
3772 if (isFLAT(MIa)) {
3773 if (isFLAT(MIb)) {
3774 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3775 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3776 return true;
3777
3778 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3779 }
3780
3781 return false;
3782 }
3783
3784 return false;
3785}
3786
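// Return true if \p Reg is a virtual register defined by a foldable
// immediate-move instruction, reporting the immediate and, optionally, the
// defining instruction.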
3787static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3788 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3789 if (Reg.isPhysical())
3790 return false;
3791 auto *Def = MRI.getUniqueVRegDef(Reg);
3792 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3793 Imm = Def->getOperand(1).getImm();
3794 if (DefMI)
3795 *DefMI = Def;
3796 return true;
3797 }
3798 return false;
3799}
3800
3801static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3802 MachineInstr **DefMI = nullptr) {
3803 if (!MO->isReg())
3804 return false;
3805 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3806 const MachineRegisterInfo &MRI = MF->getRegInfo();
3807 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3808}
3809
3810static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3811 MachineInstr &NewMI) {
3812 if (LV) {
3813 unsigned NumOps = MI.getNumOperands();
3814 for (unsigned I = 1; I < NumOps; ++I) {
3815 MachineOperand &Op = MI.getOperand(I);
3816 if (Op.isReg() && Op.isKill())
3817 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3818 }
3819 }
3820}
3821
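// Convert a two-address MAC/FMAC (or an MFMA/WMMA pseudo) into its
// three-address equivalent, folding a known immediate operand into a
// madak/madmk-style encoding when that is possible and profitable.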
3822MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3823 LiveVariables *LV,
3824 LiveIntervals *LIS) const {
3825 MachineBasicBlock &MBB = *MI.getParent();
3826 unsigned Opc = MI.getOpcode();
3827
3828 // Handle MFMA.
3829 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3830 if (NewMFMAOpc != -1) {
3831 MachineInstrBuilder MIB =
3832 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3833 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3834 MIB.add(MI.getOperand(I));
3835 updateLiveVariables(LV, MI, *MIB);
3836 if (LIS) {
3837 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3838 // SlotIndex of defs needs to be updated when converting to early-clobber
3839 MachineOperand &Def = MIB->getOperand(0);
3840 if (Def.isEarlyClobber() && Def.isReg() &&
3841 LIS->hasInterval(Def.getReg())) {
3842 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
3843 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
3844 auto &LI = LIS->getInterval(Def.getReg());
3845 auto UpdateDefIndex = [&](LiveRange &LR) {
3846 auto S = LR.find(OldIndex);
3847 if (S != LR.end() && S->start == OldIndex) {
3848 assert(S->valno && S->valno->def == OldIndex);
3849 S->start = NewIndex;
3850 S->valno->def = NewIndex;
3851 }
3852 };
3853 UpdateDefIndex(LI);
3854 for (auto &SR : LI.subranges())
3855 UpdateDefIndex(SR);
3856 }
3857 }
3858 return MIB;
3859 }
3860
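 // Handle WMMA: the two-address opcode is simply mapped to its three-address
 // equivalent and the operands are copied over unchanged.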
3861 if (SIInstrInfo::isWMMA(MI)) {
3862 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
3863 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3864 .setMIFlags(MI.getFlags());
3865 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3866 MIB->addOperand(MI.getOperand(I));
3867
3868 updateLiveVariables(LV, MI, *MIB);
3869 if (LIS)
3870 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3871
3872 return MIB;
3873 }
3874
3875 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
3876 "V_FMAC_F16_t16_e32 is not supported and not expected to be present "
3877 "pre-RA");
3878
3879 // Handle MAC/FMAC.
3880 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3881 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3882 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3883 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3884 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3885 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3886 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3887 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3888 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3889 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3890 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3891 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3892 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3893 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3894 bool Src0Literal = false;
3895
3896 switch (Opc) {
3897 default:
3898 return nullptr;
3899 case AMDGPU::V_MAC_F16_e64:
3900 case AMDGPU::V_FMAC_F16_e64:
3901 case AMDGPU::V_FMAC_F16_t16_e64:
3902 case AMDGPU::V_MAC_F32_e64:
3903 case AMDGPU::V_MAC_LEGACY_F32_e64:
3904 case AMDGPU::V_FMAC_F32_e64:
3905 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3906 case AMDGPU::V_FMAC_F64_e64:
3907 break;
3908 case AMDGPU::V_MAC_F16_e32:
3909 case AMDGPU::V_FMAC_F16_e32:
3910 case AMDGPU::V_MAC_F32_e32:
3911 case AMDGPU::V_MAC_LEGACY_F32_e32:
3912 case AMDGPU::V_FMAC_F32_e32:
3913 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3914 case AMDGPU::V_FMAC_F64_e32: {
3915 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3916 AMDGPU::OpName::src0);
3917 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3918 if (!Src0->isReg() && !Src0->isImm())
3919 return nullptr;
3920
3921 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3922 Src0Literal = true;
3923
3924 break;
3925 }
3926 }
3927
3928 MachineInstrBuilder MIB;
3929 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3930 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3931 const MachineOperand *Src0Mods =
3932 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3933 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3934 const MachineOperand *Src1Mods =
3935 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3936 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3937 const MachineOperand *Src2Mods =
3938 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
3939 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3940 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3941 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
3942
3943 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
3944 !IsLegacy &&
3945 // If we have an SGPR input, we will violate the constant bus restriction.
3946 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3947 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3948 MachineInstr *DefMI;
3949 const auto killDef = [&]() -> void {
3950 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3951 // The only user is the instruction which will be killed.
3952 Register DefReg = DefMI->getOperand(0).getReg();
3953 if (!MRI.hasOneNonDBGUse(DefReg))
3954 return;
3955 // We cannot just remove the DefMI here; the calling pass would crash.
3956 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
3957 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
3958 DefMI->removeOperand(I);
3959 if (LV)
3960 LV->getVarInfo(DefReg).AliveBlocks.clear();
3961 };
3962
3963 int64_t Imm;
3964 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
3965 unsigned NewOpc =
3966 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3967 : AMDGPU::V_FMAAK_F16)
3968 : AMDGPU::V_FMAAK_F32)
3969 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
3970 if (pseudoToMCOpcode(NewOpc) != -1) {
3971 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3972 .add(*Dst)
3973 .add(*Src0)
3974 .add(*Src1)
3975 .addImm(Imm)
3976 .setMIFlags(MI.getFlags());
3977 updateLiveVariables(LV, MI, *MIB);
3978 if (LIS)
3979 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3980 killDef();
3981 return MIB;
3982 }
3983 }
3984 unsigned NewOpc =
3985 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3986 : AMDGPU::V_FMAMK_F16)
3987 : AMDGPU::V_FMAMK_F32)
3988 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
3989 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
3990 if (pseudoToMCOpcode(NewOpc) != -1) {
3991 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3992 .add(*Dst)
3993 .add(*Src0)
3994 .addImm(Imm)
3995 .add(*Src2)
3996 .setMIFlags(MI.getFlags());
3997 updateLiveVariables(LV, MI, *MIB);
3998 if (LIS)
3999 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4000 killDef();
4001 return MIB;
4002 }
4003 }
4004 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4005 if (Src0Literal) {
4006 Imm = Src0->getImm();
4007 DefMI = nullptr;
4008 }
4009 if (pseudoToMCOpcode(NewOpc) != -1 &&
4010 isOperandLegal(
4011 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4012 Src1)) {
4013 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4014 .add(*Dst)
4015 .add(*Src1)
4016 .addImm(Imm)
4017 .add(*Src2)
4018 .setMIFlags(MI.getFlags());
4019 updateLiveVariables(LV, MI, *MIB);
4020 if (LIS)
4021 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4022 if (DefMI)
4023 killDef();
4024 return MIB;
4025 }
4026 }
4027 }
4028
4029 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4030 // if VOP3 does not allow a literal operand.
4031 if (Src0Literal && !ST.hasVOP3Literal())
4032 return nullptr;
4033
4034 unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
4035 : IsF64 ? AMDGPU::V_FMA_F64_e64
4036 : IsLegacy
4037 ? AMDGPU::V_FMA_LEGACY_F32_e64
4038 : AMDGPU::V_FMA_F32_e64
4039 : IsF16 ? AMDGPU::V_MAD_F16_e64
4040 : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
4041 : AMDGPU::V_MAD_F32_e64;
4042 if (pseudoToMCOpcode(NewOpc) == -1)
4043 return nullptr;
4044
4045 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4046 .add(*Dst)
4047 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4048 .add(*Src0)
4049 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4050 .add(*Src1)
4051 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4052 .add(*Src2)
4053 .addImm(Clamp ? Clamp->getImm() : 0)
4054 .addImm(Omod ? Omod->getImm() : 0)
4055 .setMIFlags(MI.getFlags());
4056 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4057 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4058 updateLiveVariables(LV, MI, *MIB);
4059 if (LIS)
4060 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4061 return MIB;
4062}
4063
4064// It's not generally safe to move VALU instructions across these since it will
4065// start using the register as a base index rather than directly.
4066// XXX - Why isn't hasSideEffects sufficient for these?
4067static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4068 switch (MI.getOpcode()) {
4069 case AMDGPU::S_SET_GPR_IDX_ON:
4070 case AMDGPU::S_SET_GPR_IDX_MODE:
4071 case AMDGPU::S_SET_GPR_IDX_OFF:
4072 return true;
4073 default:
4074 return false;
4075 }
4076}
4077
4078bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4079 const MachineBasicBlock *MBB,
4080 const MachineFunction &MF) const {
4081 // Skipping the check for SP writes in the base implementation. That check
4082 // was apparently added due to compile-time concerns.
4083 //
4084 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4085 // but is probably avoidable.
4086
4087 // Copied from base implementation.
4088 // Terminators and labels can't be scheduled around.
4089 if (MI.isTerminator() || MI.isPosition())
4090 return true;
4091
4092 // INLINEASM_BR can jump to another block
4093 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4094 return true;
4095
4096 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4097 return true;
4098
4099 // Target-independent instructions do not have an implicit-use of EXEC, even
4100 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4101 // boundaries prevents incorrect movements of such instructions.
4102 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4103 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4104 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4105 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4106 changesVGPRIndexingMode(MI);
4107}
4108
4109bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4110 return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
4111}
4112
4113bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4114 // Skip the full operand and register alias search that modifiesRegister
4115 // does. Only a handful of instructions touch this register, it is only an
4116 // implicit def, and it doesn't alias any other registers.
4117 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4118}
4119
4120bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4121 unsigned Opcode = MI.getOpcode();
4122
4123 if (MI.mayStore() && isSMRD(MI))
4124 return true; // scalar store or atomic
4125
4126 // This will terminate the function when other lanes may need to continue.
4127 if (MI.isReturn())
4128 return true;
4129
4130 // These instructions cause shader I/O that may cause hardware lockups
4131 // when executed with an empty EXEC mask.
4132 //
4133 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4134 // EXEC = 0, but checking for that case here seems not worth it
4135 // given the typical code patterns.
4136 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4137 isEXP(Opcode) ||
4138 Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
4139 Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
4140 return true;
4141
4142 if (MI.isCall() || MI.isInlineAsm())
4143 return true; // conservative assumption
4144
4145 // A mode change is a scalar operation that influences vector instructions.
4146 if (modifiesModeRegister(MI))
4147 return true;
4148
4149 // These are like SALU instructions in terms of effects, so it's questionable
4150 // whether we should return true for those.
4151 //
4152 // However, executing them with EXEC = 0 causes them to operate on undefined
4153 // data, which we avoid by returning true here.
4154 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4155 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4156 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4157 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4158 return true;
4159
4160 return false;
4161}
4162
4163bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4164 const MachineInstr &MI) const {
4165 if (MI.isMetaInstruction())
4166 return false;
4167
4168 // This won't read exec if this is an SGPR->SGPR copy.
4169 if (MI.isCopyLike()) {
4170 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4171 return true;
4172
4173 // Make sure this isn't copying exec as a normal operand
4174 return MI.readsRegister(AMDGPU::EXEC, &RI);
4175 }
4176
4177 // Make a conservative assumption about the callee.
4178 if (MI.isCall())
4179 return true;
4180
4181 // Be conservative with any unhandled generic opcodes.
4182 if (!isTargetSpecificOpcode(MI.getOpcode()))
4183 return true;
4184
4185 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4186}
4187
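// An inline constant is a value the hardware encodes directly in the source
// operand field (small integers, a handful of FP constants, and 1/(2*pi) on
// subtargets that support it), so it does not occupy a literal dword.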
4188bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4189 switch (Imm.getBitWidth()) {
4190 case 1: // This likely will be a condition code mask.
4191 return true;
4192
4193 case 32:
4194 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4195 ST.hasInv2PiInlineImm());
4196 case 64:
4197 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4198 ST.hasInv2PiInlineImm());
4199 case 16:
4200 return ST.has16BitInsts() &&
4201 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4202 ST.hasInv2PiInlineImm());
4203 default:
4204 llvm_unreachable("invalid bitwidth");
4205 }
4206}
4207
4208bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4209 APInt IntImm = Imm.bitcastToAPInt();
4210 int64_t IntImmVal = IntImm.getSExtValue();
4211 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4212 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4213 default:
4214 llvm_unreachable("invalid fltSemantics");
4215 case APFloat::S_IEEEsingle:
4216 case APFloat::S_IEEEdouble:
4217 return isInlineConstant(IntImm);
4218 case APFloat::S_BFloat:
4219 return ST.has16BitInsts() &&
4220 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4221 case APFloat::S_IEEEhalf:
4222 return ST.has16BitInsts() &&
4223 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4224 }
4225}
4226
4227bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
4228 uint8_t OperandType) const {
4229 assert(!MO.isReg() && "isInlineConstant called on register operand!");
4230 if (!MO.isImm())
4231 return false;
4232
4233 // MachineOperand provides no way to tell the true operand size, since it only
4234 // records a 64-bit value. We need to know the size to determine if a 32-bit
4235 // floating point immediate bit pattern is legal for an integer immediate. It
4236 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4237
4238 int64_t Imm = MO.getImm();
4239 switch (OperandType) {
4252 int32_t Trunc = static_cast<int32_t>(Imm);
4254 }
4261 ST.hasInv2PiInlineImm());
4265 // We would expect inline immediates to not be concerned with an integer/fp
4266 // distinction. However, in the case of 16-bit integer operations, the
4267 // "floating point" values appear to not work. It seems to read the low 16 bits
4268 // of 32-bit immediates, which happens to always work for the integer
4269 // values.
4270 //
4271 // See llvm bugzilla 46302.
4272 //
4273 // TODO: Theoretically we could use op-sel to use the high bits of the
4274 // 32-bit FP values.
4292 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4293 // A few special case instructions have 16-bit operands on subtargets
4294 // where 16-bit instructions are not legal.
4295 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4296 // constants in these cases
4297 int16_t Trunc = static_cast<int16_t>(Imm);
4298 return ST.has16BitInsts() &&
4300 }
4301
4302 return false;
4303 }
4308 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4309 int16_t Trunc = static_cast<int16_t>(Imm);
4310 return ST.has16BitInsts() &&
4312 }
4313 return false;
4314 }
4317 return false;
4320 // Always embedded in the instruction for free.
4321 return true;
4331 // Just ignore anything else.
4332 return true;
4333 default:
4334 llvm_unreachable("invalid operand type");
4335 }
4336}
4337
4338static bool compareMachineOp(const MachineOperand &Op0,
4339 const MachineOperand &Op1) {
4340 if (Op0.getType() != Op1.getType())
4341 return false;
4342
4343 switch (Op0.getType()) {
4344 case MachineOperand::MO_Register:
4345 return Op0.getReg() == Op1.getReg();
4346 case MachineOperand::MO_Immediate:
4347 return Op0.getImm() == Op1.getImm();
4348 default:
4349 llvm_unreachable("Didn't expect to be comparing these operand types");
4350 }
4351}
4352
4353bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4354 const MachineOperand &MO) const {
4355 const MCInstrDesc &InstDesc = MI.getDesc();
4356 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4357
4358 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4359
4360 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4361 return true;
4362
4363 if (OpInfo.RegClass < 0)
4364 return false;
4365
4366 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4367 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4368 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4369 AMDGPU::OpName::src2))
4370 return false;
4371 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4372 }
4373
4374 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4375 return false;
4376
4377 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4378 return true;
4379
4380 return ST.hasVOP3Literal();
4381}
4382
4383bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4384 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4385 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4386 return false;
4387
4388 int Op32 = AMDGPU::getVOPe32(Opcode);
4389 if (Op32 == -1)
4390 return false;
4391
4392 return pseudoToMCOpcode(Op32) != -1;
4393}
4394
4395bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4396 // The src0_modifier operand is present on all instructions
4397 // that have modifiers.
4398
4399 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4400}
4401
4402bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4403 unsigned OpName) const {
4404 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4405 return Mods && Mods->getImm();
4406}
4407
4408bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4409 return any_of(ModifierOpNames,
4410 [&](unsigned Name) { return hasModifiersSet(MI, Name); });
4411}
4412
4413bool SIInstrInfo::canShrink(const MachineInstr &MI,
4414 const MachineRegisterInfo &MRI) const {
4415 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4416 // Can't shrink instruction with three operands.
4417 if (Src2) {
4418 switch (MI.getOpcode()) {
4419 default: return false;
4420
4421 case AMDGPU::V_ADDC_U32_e64:
4422 case AMDGPU::V_SUBB_U32_e64:
4423 case AMDGPU::V_SUBBREV_U32_e64: {
4424 const MachineOperand *Src1
4425 = getNamedOperand(MI, AMDGPU::OpName::src1);
4426 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4427 return false;
4428 // Additional verification is needed for sdst/src2.
4429 return true;
4430 }
4431 case AMDGPU::V_MAC_F16_e64:
4432 case AMDGPU::V_MAC_F32_e64:
4433 case AMDGPU::V_MAC_LEGACY_F32_e64:
4434 case AMDGPU::V_FMAC_F16_e64:
4435 case AMDGPU::V_FMAC_F16_t16_e64:
4436 case AMDGPU::V_FMAC_F32_e64:
4437 case AMDGPU::V_FMAC_F64_e64:
4438 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4439 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4440 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4441 return false;
4442 break;
4443
4444 case AMDGPU::V_CNDMASK_B32_e64:
4445 break;
4446 }
4447 }
4448
4449 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4450 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4451 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4452 return false;
4453
4454 // We don't need to check src0, all input types are legal, so just make sure
4455 // src0 isn't using any modifiers.
4456 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4457 return false;
4458
4459 // Can it be shrunk to a valid 32 bit opcode?
4460 if (!hasVALU32BitEncoding(MI.getOpcode()))
4461 return false;
4462
4463 // Check output modifiers
4464 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4465 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4466 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel);
4467}
4468
4469// Set VCC operand with all flags from \p Orig, except for setting it as
4470// implicit.
4471static void copyFlagsToImplicitVCC(MachineInstr &MI,
4472 const MachineOperand &Orig) {
4473
4474 for (MachineOperand &Use : MI.implicit_operands()) {
4475 if (Use.isUse() &&
4476 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4477 Use.setIsUndef(Orig.isUndef());
4478 Use.setIsKill(Orig.isKill());
4479 return;
4480 }
4481 }
4482}
4483
4484MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4485 unsigned Op32) const {
4486 MachineBasicBlock *MBB = MI.getParent();
4487
4488 const MCInstrDesc &Op32Desc = get(Op32);
4489 MachineInstrBuilder Inst32 =
4490 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4491 .setMIFlags(MI.getFlags());
4492
4493 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4494 // For VOPC instructions, this is replaced by an implicit def of vcc.
4495
4496 // We assume the defs of the shrunk opcode are in the same order, and the
4497 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4498 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4499 Inst32.add(MI.getOperand(I));
4500
4501 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4502
4503 int Idx = MI.getNumExplicitDefs();
4504 for (const MachineOperand &Use : MI.explicit_uses()) {
4505 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4506 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4507 continue;
4508
4509 if (&Use == Src2) {
4510 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4511 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4512 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4513 // of vcc was already added during the initial BuildMI, but we
4514 // 1) may need to change vcc to vcc_lo to preserve the original register
4515 // 2) have to preserve the original flags.
4516 fixImplicitOperands(*Inst32);
4517 copyFlagsToImplicitVCC(*Inst32, *Src2);
4518 continue;
4519 }
4520 }
4521
4522 Inst32.add(Use);
4523 }
4524
4525 // FIXME: Losing implicit operands
4526
4527 return Inst32;
4528}
4529
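// An operand occupies the constant bus if it is a literal that is not an
// inline constant, or if it reads an SGPR (the null register is free).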
4530bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4531 const MachineOperand &MO,
4532 const MCOperandInfo &OpInfo) const {
4533 // Literal constants use the constant bus.
4534 if (!MO.isReg())
4535 return !isInlineConstant(MO, OpInfo);
4536
4537 if (!MO.isUse())
4538 return false;
4539
4540 if (MO.getReg().isVirtual())
4541 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4542
4543 // Null is free
4544 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4545 return false;
4546
4547 // SGPRs use the constant bus
4548 if (MO.isImplicit()) {
4549 return MO.getReg() == AMDGPU::M0 ||
4550 MO.getReg() == AMDGPU::VCC ||
4551 MO.getReg() == AMDGPU::VCC_LO;
4552 } else {
4553 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4554 AMDGPU::SReg_64RegClass.contains(MO.getReg());
4555 }
4556}
4557
4558static Register findImplicitSGPRRead(const MachineInstr &MI) {
4559 for (const MachineOperand &MO : MI.implicit_operands()) {
4560 // We only care about reads.
4561 if (MO.isDef())
4562 continue;
4563
4564 switch (MO.getReg()) {
4565 case AMDGPU::VCC:
4566 case AMDGPU::VCC_LO:
4567 case AMDGPU::VCC_HI:
4568 case AMDGPU::M0:
4569 case AMDGPU::FLAT_SCR:
4570 return MO.getReg();
4571
4572 default:
4573 break;
4574 }
4575 }
4576
4577 return Register();
4578}
4579
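// Decide whether the verifier should expect an implicit use of EXEC on this
// instruction. Lane-access instructions (readlane/writelane and the SGPR
// spill pseudos) are VALU but do not read EXEC.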
4580static bool shouldReadExec(const MachineInstr &MI) {
4581 if (SIInstrInfo::isVALU(MI)) {
4582 switch (MI.getOpcode()) {
4583 case AMDGPU::V_READLANE_B32:
4584 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4585 case AMDGPU::V_WRITELANE_B32:
4586 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4587 return false;
4588 }
4589
4590 return true;
4591 }
4592
4593 if (MI.isPreISelOpcode() ||
4594 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4597 return false;
4598
4599 return true;
4600}
4601
4602static bool isSubRegOf(const SIRegisterInfo &TRI,
4603 const MachineOperand &SuperVec,
4604 const MachineOperand &SubReg) {
4605 if (SubReg.getReg().isPhysical())
4606 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4607
4608 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4609 SubReg.getReg() == SuperVec.getReg();
4610}
4611
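// Machine verifier hook: check AMDGPU-specific constraints (operand counts,
// register classes and alignment, SDWA/DPP/image encodings, constant bus
// usage, implicit EXEC reads) and report the first violation via ErrInfo.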
4612bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4613 StringRef &ErrInfo) const {
4614 uint16_t Opcode = MI.getOpcode();
4615 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
4616 return true;
4617
4618 const MachineFunction *MF = MI.getParent()->getParent();
4619 const MachineRegisterInfo &MRI = MF->getRegInfo();
4620
4621 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4622 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4623 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4624 int Src3Idx = -1;
4625 if (Src0Idx == -1) {
4626 // VOPD V_DUAL_* instructions use different operand names.
4627 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4628 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4629 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4630 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4631 }
4632
4633 // Make sure the number of operands is correct.
4634 const MCInstrDesc &Desc = get(Opcode);
4635 if (!Desc.isVariadic() &&
4636 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4637 ErrInfo = "Instruction has wrong number of operands.";
4638 return false;
4639 }
4640
4641 if (MI.isInlineAsm()) {
4642 // Verify register classes for inlineasm constraints.
4643 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4644 I != E; ++I) {
4645 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4646 if (!RC)
4647 continue;
4648
4649 const MachineOperand &Op = MI.getOperand(I);
4650 if (!Op.isReg())
4651 continue;
4652
4653 Register Reg = Op.getReg();
4654 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4655 ErrInfo = "inlineasm operand has incorrect register class.";
4656 return false;
4657 }
4658 }
4659
4660 return true;
4661 }
4662
4663 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4664 ErrInfo = "missing memory operand from image instruction.";
4665 return false;
4666 }
4667
4668 // Make sure the register classes are correct.
4669 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4670 const MachineOperand &MO = MI.getOperand(i);
4671 if (MO.isFPImm()) {
4672 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4673 "all fp values to integers.";
4674 return false;
4675 }
4676
4677 int RegClass = Desc.operands()[i].RegClass;
4678
4679 switch (Desc.operands()[i].OperandType) {
4681 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4682 ErrInfo = "Illegal immediate value for operand.";
4683 return false;
4684 }
4685 break;
4690 break;
4702 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4703 ErrInfo = "Illegal immediate value for operand.";
4704 return false;
4705 }
4706 break;
4707 }
4709 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4710 ErrInfo = "Expected inline constant for operand.";
4711 return false;
4712 }
4713 break;
4716 // Check if this operand is an immediate.
4717 // FrameIndex operands will be replaced by immediates, so they are
4718 // allowed.
4719 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4720 ErrInfo = "Expected immediate, but got non-immediate";
4721 return false;
4722 }
4723 [[fallthrough]];
4724 default:
4725 continue;
4726 }
4727
4728 if (!MO.isReg())
4729 continue;
4730 Register Reg = MO.getReg();
4731 if (!Reg)
4732 continue;
4733
4734 // FIXME: Ideally we would have separate instruction definitions with the
4735 // aligned register constraint.
4736 // FIXME: We do not verify inline asm operands, but custom inline asm
4737 // verification is broken anyway
4738 if (ST.needsAlignedVGPRs()) {
4739 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4740 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4741 const TargetRegisterClass *SubRC =
4742 RI.getSubRegisterClass(RC, MO.getSubReg());
4743 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4744 if (RC)
4745 RC = SubRC;
4746 }
4747
4748 // Check that this is the aligned version of the class.
4749 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4750 ErrInfo = "Subtarget requires even aligned vector registers";
4751 return false;
4752 }
4753 }
4754
4755 if (RegClass != -1) {
4756 if (Reg.isVirtual())
4757 continue;
4758
4759 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4760 if (!RC->contains(Reg)) {
4761 ErrInfo = "Operand has incorrect register class.";
4762 return false;
4763 }
4764 }
4765 }
4766
4767 // Verify SDWA
4768 if (isSDWA(MI)) {
4769 if (!ST.hasSDWA()) {
4770 ErrInfo = "SDWA is not supported on this target";
4771 return false;
4772 }
4773
4774 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4775
4776 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4777 if (OpIdx == -1)
4778 continue;
4779 const MachineOperand &MO = MI.getOperand(OpIdx);
4780
4781 if (!ST.hasSDWAScalar()) {
4782 // Only VGPRs on VI
4783 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4784 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4785 return false;
4786 }
4787 } else {
4788 // No immediates on GFX9
4789 if (!MO.isReg()) {
4790 ErrInfo =
4791 "Only reg allowed as operands in SDWA instructions on GFX9+";
4792 return false;
4793 }
4794 }
4795 }
4796
4797 if (!ST.hasSDWAOmod()) {
4798 // No omod allowed on VI
4799 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4800 if (OMod != nullptr &&
4801 (!OMod->isImm() || OMod->getImm() != 0)) {
4802 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4803 return false;
4804 }
4805 }
4806
4807 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
4808 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
4809 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
4810 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
4811 const MachineOperand *Src0ModsMO =
4812 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4813 unsigned Mods = Src0ModsMO->getImm();
4814 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
4815 Mods & SISrcMods::SEXT) {
4816 ErrInfo = "sext, abs and neg are not allowed on this instruction";
4817 return false;
4818 }
4819 }
4820
4821 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4822 if (isVOPC(BasicOpcode)) {
4823 if (!ST.hasSDWASdst() && DstIdx != -1) {
4824 // Only vcc allowed as dst on VI for VOPC
4825 const MachineOperand &Dst = MI.getOperand(DstIdx);
4826 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4827 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4828 return false;
4829 }
4830 } else if (!ST.hasSDWAOutModsVOPC()) {
4831 // No clamp allowed on GFX9 for VOPC
4832 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4833 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4834 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
4835 return false;
4836 }
4837
4838 // No omod allowed on GFX9 for VOPC
4839 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4840 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4841 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
4842 return false;
4843 }
4844 }
4845 }
4846
4847 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
4848 if (DstUnused && DstUnused->isImm() &&
4849 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4850 const MachineOperand &Dst = MI.getOperand(DstIdx);
4851 if (!Dst.isReg() || !Dst.isTied()) {
4852 ErrInfo = "Dst register should have tied register";
4853 return false;
4854 }
4855
4856 const MachineOperand &TiedMO =
4857 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
4858 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
4859 ErrInfo =
4860 "Dst register should be tied to implicit use of preserved register";
4861 return false;
4862 } else if (TiedMO.getReg().isPhysical() &&
4863 Dst.getReg() != TiedMO.getReg()) {
4864 ErrInfo = "Dst register should use same physical register as preserved";
4865 return false;
4866 }
4867 }
4868 }
4869
4870 // Verify MIMG / VIMAGE / VSAMPLE
4871 if (isImage(MI.getOpcode()) && !MI.mayStore()) {
4872 // Ensure that the return type used is large enough for all the options
4873 // being used. TFE/LWE require an extra result register.
4874 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4875 if (DMask) {
4876 uint64_t DMaskImm = DMask->getImm();
4877 uint32_t RegCount =
4878 isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm);
4879 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
4880 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
4881 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
4882
4883 // Adjust for packed 16 bit values
4884 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
4885 RegCount = divideCeil(RegCount, 2);
4886
4887 // Adjust if using LWE or TFE
4888 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
4889 RegCount += 1;
4890
4891 const uint32_t DstIdx =
4892 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
4893 const MachineOperand &Dst = MI.getOperand(DstIdx);
4894 if (Dst.isReg()) {
4895 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
4896 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
4897 if (RegCount > DstSize) {
4898 ErrInfo = "Image instruction returns too many registers for dst "
4899 "register class";
4900 return false;
4901 }
4902 }
4903 }
4904 }
4905
4906 // Verify VOP*. Ignore multiple sgpr operands on writelane.
4907 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
4908 unsigned ConstantBusCount = 0;
4909 bool UsesLiteral = false;
4910 const MachineOperand *LiteralVal = nullptr;
4911
4912 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
4913 if (ImmIdx != -1) {
4914 ++ConstantBusCount;
4915 UsesLiteral = true;
4916 LiteralVal = &MI.getOperand(ImmIdx);
4917 }
4918
4919 SmallVector<Register, 2> SGPRsUsed;
4920 Register SGPRUsed;
4921
4922 // Only look at the true operands. Only a real operand can use the constant
4923 // bus, and we don't want to check pseudo-operands like the source modifier
4924 // flags.
4925 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
4926 if (OpIdx == -1)
4927 continue;
4928 const MachineOperand &MO = MI.getOperand(OpIdx);
4929 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4930 if (MO.isReg()) {
4931 SGPRUsed = MO.getReg();
4932 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
4933 ++ConstantBusCount;
4934 SGPRsUsed.push_back(SGPRUsed);
4935 }
4936 } else {
4937 if (!UsesLiteral) {
4938 ++ConstantBusCount;
4939 UsesLiteral = true;
4940 LiteralVal = &MO;
4941 } else if (!MO.isIdenticalTo(*LiteralVal)) {
4942 assert(isVOP2(MI) || isVOP3(MI));
4943 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
4944 return false;
4945 }
4946 }
4947 }
4948 }
4949
4950 SGPRUsed = findImplicitSGPRRead(MI);
4951 if (SGPRUsed) {
4952 // Implicit uses may safely overlap true operands
4953 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
4954 return !RI.regsOverlap(SGPRUsed, SGPR);
4955 })) {
4956 ++ConstantBusCount;
4957 SGPRsUsed.push_back(SGPRUsed);
4958 }
4959 }
4960
4961 // v_writelane_b32 is an exception to the constant bus restriction:
4962 // vsrc0 can be an SGPR, a constant, or m0; the lane select can be an SGPR, m0, or an inline constant.
4963 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
4964 Opcode != AMDGPU::V_WRITELANE_B32) {
4965 ErrInfo = "VOP* instruction violates constant bus restriction";
4966 return false;
4967 }
4968
4969 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
4970 ErrInfo = "VOP3 instruction uses literal";
4971 return false;
4972 }
4973 }
4974
4975 // Special case for writelane - this can break the multiple constant bus rule,
4976 // but still can't use more than one SGPR register
4977 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
4978 unsigned SGPRCount = 0;
4979 Register SGPRUsed;
4980
4981 for (int OpIdx : {Src0Idx, Src1Idx}) {
4982 if (OpIdx == -1)
4983 break;
4984
4985 const MachineOperand &MO = MI.getOperand(OpIdx);
4986
4987 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4988 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
4989 if (MO.getReg() != SGPRUsed)
4990 ++SGPRCount;
4991 SGPRUsed = MO.getReg();
4992 }
4993 }
4994 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
4995 ErrInfo = "WRITELANE instruction violates constant bus restriction";
4996 return false;
4997 }
4998 }
4999 }
5000
5001 // Verify misc. restrictions on specific instructions.
5002 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5003 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5004 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5005 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5006 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5007 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5008 if (!compareMachineOp(Src0, Src1) &&
5009 !compareMachineOp(Src0, Src2)) {
5010 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5011 return false;
5012 }
5013 }
5014 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5015 SISrcMods::ABS) ||
5016 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5017 SISrcMods::ABS) ||
5018 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5019 SISrcMods::ABS)) {
5020 ErrInfo = "ABS not allowed in VOP3B instructions";
5021 return false;
5022 }
5023 }
5024
5025 if (isSOP2(MI) || isSOPC(MI)) {
5026 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5027 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5028
5029 if (!Src0.isReg() && !Src1.isReg() &&
5030 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5031 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5032 !Src0.isIdenticalTo(Src1)) {
5033 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5034 return false;
5035 }
5036 }
5037
5038 if (isSOPK(MI)) {
5039 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5040 if (Desc.isBranch()) {
5041 if (!Op->isMBB()) {
5042 ErrInfo = "invalid branch target for SOPK instruction";
5043 return false;
5044 }
5045 } else {
5046 uint64_t Imm = Op->getImm();
5047 if (sopkIsZext(Opcode)) {
5048 if (!isUInt<16>(Imm)) {
5049 ErrInfo = "invalid immediate for SOPK instruction";
5050 return false;
5051 }
5052 } else {
5053 if (!isInt<16>(Imm)) {
5054 ErrInfo = "invalid immediate for SOPK instruction";
5055 return false;
5056 }
5057 }
5058 }
5059 }
5060
5061 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5062 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5063 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5064 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5065 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5066 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5067
5068 const unsigned StaticNumOps =
5069 Desc.getNumOperands() + Desc.implicit_uses().size();
5070 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5071
5072 // Allow additional implicit operands. This allows a fixup done by the post
5073 // RA scheduler where the main implicit operand is killed and implicit-defs
5074 // are added for sub-registers that remain live after this instruction.
5075 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5076 ErrInfo = "missing implicit register operands";
5077 return false;
5078 }
5079
5080 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5081 if (IsDst) {
5082 if (!Dst->isUse()) {
5083 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5084 return false;
5085 }
5086
5087 unsigned UseOpIdx;
5088 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5089 UseOpIdx != StaticNumOps + 1) {
5090 ErrInfo = "movrel implicit operands should be tied";
5091 return false;
5092 }
5093 }
5094
5095 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5096 const MachineOperand &ImpUse
5097 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5098 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5099 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5100 ErrInfo = "src0 should be subreg of implicit vector use";
5101 return false;
5102 }
5103 }
5104
5105 // Make sure we aren't losing exec uses in the td files. This mostly requires
5106 // being careful when using let Uses to try to add other use registers.
5107 if (shouldReadExec(MI)) {
5108 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5109 ErrInfo = "VALU instruction does not implicitly read exec mask";
5110 return false;
5111 }
5112 }
5113
5114 if (isSMRD(MI)) {
5115 if (MI.mayStore() &&
5116 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5117 // The register offset form of scalar stores may only use m0 as the
5118 // soffset register.
5119 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5120 if (Soff && Soff->getReg() != AMDGPU::M0) {
5121 ErrInfo = "scalar stores must use m0 as offset register";
5122 return false;
5123 }
5124 }
5125 }
5126
5127 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5128 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5129 if (Offset->getImm() != 0) {
5130 ErrInfo = "subtarget does not support offsets in flat instructions";
5131 return false;
5132 }
5133 }
5134
5135 if (isDS(MI) && !ST.hasGDS()) {
5136 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5137 if (GDSOp && GDSOp->getImm() != 0) {
5138 ErrInfo = "GDS is not supported on this subtarget";
5139 return false;
5140 }
5141 }
5142
5143 if (isImage(MI)) {
5144 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5145 if (DimOp) {
5146 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5147 AMDGPU::OpName::vaddr0);
5148 int RSrcOpName =
5149 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5150 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5151 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5152 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5153 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5154 const AMDGPU::MIMGDimInfo *Dim =
5155 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5156
5157 if (!Dim) {
5158 ErrInfo = "dim is out of range";
5159 return false;
5160 }
5161
5162 bool IsA16 = false;
5163 if (ST.hasR128A16()) {
5164 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5165 IsA16 = R128A16->getImm() != 0;
5166 } else if (ST.hasA16()) {
5167 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5168 IsA16 = A16->getImm() != 0;
5169 }
5170
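 // With a non-sequential address (NSA) encoding every address component has
 // its own operand, so the operand distance between vaddr0 and the resource
 // descriptor gives the number of address registers in use.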
5171 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5172
5173 unsigned AddrWords =
5174 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5175
5176 unsigned VAddrWords;
5177 if (IsNSA) {
5178 VAddrWords = RsrcIdx - VAddr0Idx;
5179 if (ST.hasPartialNSAEncoding() &&
5180 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5181 unsigned LastVAddrIdx = RsrcIdx - 1;
5182 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5183 }
5184 } else {
5185 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5186 if (AddrWords > 12)
5187 AddrWords = 16;
5188 }
5189
5190 if (VAddrWords != AddrWords) {
5191 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5192 << " but got " << VAddrWords << "\n");
5193 ErrInfo = "bad vaddr size";
5194 return false;
5195 }
5196 }
5197 }
5198
5199 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5200 if (DppCt) {
5201 using namespace AMDGPU::DPP;
5202
5203 unsigned DC = DppCt->getImm();
5204 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5205 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5206 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5207 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5208 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5209 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5210 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5211 ErrInfo = "Invalid dpp_ctrl value";
5212 return false;
5213 }
5214 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5215 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5216 ErrInfo = "Invalid dpp_ctrl value: "
5217 "wavefront shifts are not supported on GFX10+";
5218 return false;
5219 }
5220 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5221 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5222 ErrInfo = "Invalid dpp_ctrl value: "
5223 "broadcasts are not supported on GFX10+";
5224 return false;
5225 }
5226 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5227 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5228 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5229 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5230 !ST.hasGFX90AInsts()) {
5231 ErrInfo = "Invalid dpp_ctrl value: "
5232 "row_newbroadcast/row_share is not supported before "
5233 "GFX90A/GFX10";
5234 return false;
5235 } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5236 ErrInfo = "Invalid dpp_ctrl value: "
5237 "row_share and row_xmask are not supported before GFX10";
5238 return false;
5239 }
5240 }
5241
5242 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5243 !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) {
5244 ErrInfo = "Invalid dpp_ctrl value: "
5245 "DP ALU dpp only support row_newbcast";
5246 return false;
5247 }
5248 }
5249
5250 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5251 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5252 uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
5253 : AMDGPU::OpName::vdata;
5254 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
5255 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5256 if (Data && !Data->isReg())
5257 Data = nullptr;
5258
5259 if (ST.hasGFX90AInsts()) {
5260 if (Dst && Data &&
5261 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5262 ErrInfo = "Invalid register class: "
5263 "vdata and vdst should be both VGPR or AGPR";
5264 return false;
5265 }
5266 if (Data && Data2 &&
5267 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5268 ErrInfo = "Invalid register class: "
5269 "both data operands should be VGPR or AGPR";
5270 return false;
5271 }
5272 } else {
5273 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5274 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5275 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5276 ErrInfo = "Invalid register class: "
5277 "agpr loads and stores not supported on this GPU";
5278 return false;
5279 }
5280 }
5281 }
5282
5283 if (ST.needsAlignedVGPRs()) {
5284 const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
5285 const MachineOperand *Op = getNamedOperand(MI, OpName);
5286 if (!Op)
5287 return true;
5288 Register Reg = Op->getReg();
5289 if (Reg.isPhysical())
5290 return !(RI.getHWRegIndex(Reg) & 1);
5291 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5292 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5293 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5294 };
5295
5296 if (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
5297 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
5298 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
5299
5300 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5301 ErrInfo = "Subtarget requires even aligned vector registers "
5302 "for DS_GWS instructions";
5303 return false;
5304 }
5305 }
5306
5307 if (isMIMG(MI)) {
5308 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5309 ErrInfo = "Subtarget requires even aligned vector registers "
5310 "for vaddr operand of image instructions";
5311 return false;
5312 }
5313 }
5314 }
5315
5316 if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
5317 !ST.hasGFX90AInsts()) {
5318 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5319 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5320 ErrInfo = "Invalid register class: "
5321 "v_accvgpr_write with an SGPR is not supported on this GPU";
5322 return false;
5323 }
5324 }
5325
5326 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5327 const MachineOperand &SrcOp = MI.getOperand(1);
5328 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5329 ErrInfo = "pseudo expects only physical SGPRs";
5330 return false;
5331 }
5332 }
5333
5334 return true;
5335}
5336
5337// It is more readable to list mapped opcodes on the same line.
5338// clang-format off
5339
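// Map a scalar (SALU) opcode to the VALU opcode to use when the instruction
// has to be moved to the vector pipeline; INSTRUCTION_LIST_END means there is
// no direct equivalent.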
5340unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5341 switch (MI.getOpcode()) {
5342 default: return AMDGPU::INSTRUCTION_LIST_END;
5343 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5344 case AMDGPU::COPY: return AMDGPU::COPY;
5345 case AMDGPU::PHI: return AMDGPU::PHI;
5346 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5347 case AMDGPU::WQM: return AMDGPU::WQM;
5348 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5349 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5350 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5351 case AMDGPU::S_MOV_B32: {
5352 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5353 return MI.getOperand(1).isReg() ||
5354 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5355 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5356 }
5357 case AMDGPU::S_ADD_I32:
5358 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5359 case AMDGPU::S_ADDC_U32:
5360 return AMDGPU::V_ADDC_U32_e32;
5361 case AMDGPU::S_SUB_I32:
5362 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5363 // FIXME: These are not consistently handled, and selected when the carry is
5364 // used.
5365 case AMDGPU::S_ADD_U32:
5366 return AMDGPU::V_ADD_CO_U32_e32;
5367 case AMDGPU::S_SUB_U32:
5368 return AMDGPU::V_SUB_CO_U32_e32;
5369 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5370 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5371 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5372 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5373 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5374 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5375 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5376 case AMDGPU::S_XNOR_B32:
5377 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5378 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5379 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5380 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5381 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5382 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5383 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5384 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5385 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5386 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5387 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5388 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5389 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5390 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5391 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5392 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5393 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5394 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5395 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5396 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5397 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5398 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5399 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5400 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5401 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5402 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5403 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5404 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5405 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5406 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5407 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5408 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5409 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5410 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5411 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5412 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5413 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5414 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5415 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5416 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5417 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5418 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5419 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5420 case AMDGPU::S_CVT_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5421 case AMDGPU::S_CVT_HI_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5422 case AMDGPU::S_CVT_F16_F32: return AMDGPU::V_CVT_F16_F32_t16_e64;
5423 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5424 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5425 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5426 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5427 case AMDGPU::S_CEIL_F16:
5428 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5429 : AMDGPU::V_CEIL_F16_fake16_e64;
5430 case AMDGPU::S_FLOOR_F16:
5431 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5432 : AMDGPU::V_FLOOR_F16_fake16_e64;
5433 case AMDGPU::S_TRUNC_F16:
5434 return AMDGPU::V_TRUNC_F16_fake16_e64;
5435 case AMDGPU::S_RNDNE_F16:
5436 return AMDGPU::V_RNDNE_F16_fake16_e64;
5437 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5438 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5439 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5440 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5441 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5442 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5443 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5444 case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
5445 case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
5446 case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
5447 case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5448 case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5449 case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5450 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
5451 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5452 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5453 case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64;
5454 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5455 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5456 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5457 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5458 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5459 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5460 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5461 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5462 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5463 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5464 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5465 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5466 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5467 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5468 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5469 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5470 case AMDGPU::S_CMP_LT_F16: return AMDGPU::V_CMP_LT_F16_t16_e64;
5471 case AMDGPU::S_CMP_EQ_F16: return AMDGPU::V_CMP_EQ_F16_t16_e64;
5472 case AMDGPU::S_CMP_LE_F16: return AMDGPU::V_CMP_LE_F16_t16_e64;
5473 case AMDGPU::S_CMP_GT_F16: return AMDGPU::V_CMP_GT_F16_t16_e64;
5474 case AMDGPU::S_CMP_LG_F16: return AMDGPU::V_CMP_LG_F16_t16_e64;
5475 case AMDGPU::S_CMP_GE_F16: return AMDGPU::V_CMP_GE_F16_t16_e64;
5476 case AMDGPU::S_CMP_O_F16: return AMDGPU::V_CMP_O_F16_t16_e64;
5477 case AMDGPU::S_CMP_U_F16: return AMDGPU::V_CMP_U_F16_t16_e64;
5478 case AMDGPU::S_CMP_NGE_F16: return AMDGPU::V_CMP_NGE_F16_t16_e64;
5479 case AMDGPU::S_CMP_NLG_F16: return AMDGPU::V_CMP_NLG_F16_t16_e64;
5480 case AMDGPU::S_CMP_NGT_F16: return AMDGPU::V_CMP_NGT_F16_t16_e64;
5481 case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
5482 case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
5483 case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
5484 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5485 case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
5486 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5487 case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
5488 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5489 case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
5490 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5491 case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
5492 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5493 case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
5494 }
5495 llvm_unreachable(
5496 "Unexpected scalar opcode without corresponding vector one!");
5497}
5498
5499// clang-format on
5500
5501 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5502 MachineBasicBlock &MBB,
5503 MachineBasicBlock::iterator MBBI,
5504 const DebugLoc &DL, Register Reg,
5505 bool IsSCCLive,
5506 SlotIndexes *Indexes) const {
5507 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5508 const SIInstrInfo *TII = ST.getInstrInfo();
5509 bool IsWave32 = ST.isWave32();
5510 if (IsSCCLive) {
5511 // Insert two move instructions, one to save the original value of EXEC and
5512 // the other to turn on all bits in EXEC. This is required as we can't use
5513 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
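    // A sketch of the two sequences built here (wave64 mnemonics shown; the
    // wave32 path uses the *_B32 forms; illustration only, not verbatim MIR):
    //   SCC live:  S_MOV_B64 <Reg>, exec
    //              S_MOV_B64 exec, -1
    //   SCC dead:  S_OR_SAVEEXEC_B64 <Reg>, -1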
5514 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5515 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5516 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5517 .addReg(Exec, RegState::Kill);
5518 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5519 if (Indexes) {
5520 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5521 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5522 }
5523 } else {
5524 const unsigned OrSaveExec =
5525 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5526 auto SaveExec =
5527 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5528 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5529 if (Indexes)
5530 Indexes->insertMachineInstrInMaps(*SaveExec);
5531 }
5532}
5533
5534 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5535 MachineBasicBlock::iterator MBBI,
5536 const DebugLoc &DL, Register Reg,
5537 SlotIndexes *Indexes) const {
5538 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5539 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5540 auto ExecRestoreMI =
5541 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5542 if (Indexes)
5543 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5544}
5545
5546static const TargetRegisterClass *
5547 adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
5548 const MachineRegisterInfo &MRI,
5549 const MCInstrDesc &TID, unsigned RCID,
5550 bool IsAllocatable) {
5551 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5552 (((TID.mayLoad() || TID.mayStore()) &&
5553 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5554 (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
5555 switch (RCID) {
5556 case AMDGPU::AV_32RegClassID:
5557 RCID = AMDGPU::VGPR_32RegClassID;
5558 break;
5559 case AMDGPU::AV_64RegClassID:
5560 RCID = AMDGPU::VReg_64RegClassID;
5561 break;
5562 case AMDGPU::AV_96RegClassID:
5563 RCID = AMDGPU::VReg_96RegClassID;
5564 break;
5565 case AMDGPU::AV_128RegClassID:
5566 RCID = AMDGPU::VReg_128RegClassID;
5567 break;
5568 case AMDGPU::AV_160RegClassID:
5569 RCID = AMDGPU::VReg_160RegClassID;
5570 break;
5571 case AMDGPU::AV_512RegClassID:
5572 RCID = AMDGPU::VReg_512RegClassID;
5573 break;
5574 default:
5575 break;
5576 }
5577 }
5578
5579 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5580}
5581
5582 const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
5583 unsigned OpNum, const TargetRegisterInfo *TRI,
5584 const MachineFunction &MF)
5585 const {
5586 if (OpNum >= TID.getNumOperands())
5587 return nullptr;
5588 auto RegClass = TID.operands()[OpNum].RegClass;
5589 bool IsAllocatable = false;
5590 if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
5591 // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
5592 // with two data operands. Request a register class constrained to VGPR only
5593 // if both operands are present, as Machine Copy Propagation cannot check this
5594 // constraint, and possibly other passes cannot either.
5595 //
5596 // The check is limited to FLAT and DS because atomics in non-flat encoding
5597 // have their vdst and vdata tied to be the same register.
5598 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5599 AMDGPU::OpName::vdst);
5600 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5601 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5602 : AMDGPU::OpName::vdata);
5603 if (DataIdx != -1) {
5604 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5605 TID.Opcode, AMDGPU::OpName::data1);
5606 }
5607 }
5608 return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
5609 IsAllocatable);
5610}
5611
5612 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
5613 unsigned OpNo) const {
5614 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5615 const MCInstrDesc &Desc = get(MI.getOpcode());
5616 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
5617 Desc.operands()[OpNo].RegClass == -1) {
5618 Register Reg = MI.getOperand(OpNo).getReg();
5619
5620 if (Reg.isVirtual())
5621 return MRI.getRegClass(Reg);
5622 return RI.getPhysRegBaseClass(Reg);
5623 }
5624
5625 unsigned RCID = Desc.operands()[OpNo].RegClass;
5626 return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
5627}
5628
5629 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
5630 MachineBasicBlock::iterator I = MI;
5631 MachineBasicBlock *MBB = MI.getParent();
5632 MachineOperand &MO = MI.getOperand(OpIdx);
5633 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
5634 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
5635 const TargetRegisterClass *RC = RI.getRegClass(RCID);
5636 unsigned Size = RI.getRegSizeInBits(*RC);
5637 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
5638 if (MO.isReg())
5639 Opcode = AMDGPU::COPY;
5640 else if (RI.isSGPRClass(RC))
5641 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
5642
5643 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
5644 Register Reg = MRI.createVirtualRegister(VRC);
5645 const DebugLoc &DL = MI.getDebugLoc();
5646 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
5647 MO.ChangeToRegister(Reg, false);
5648}
5649
5650 unsigned SIInstrInfo::buildExtractSubReg(
5651 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
5652 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
5653 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5654 MachineBasicBlock *MBB = MI->getParent();
5655 DebugLoc DL = MI->getDebugLoc();
5656 Register SubReg = MRI.createVirtualRegister(SubRC);
5657
5658 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
5659 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5660 .addReg(SuperReg.getReg(), 0, NewSubIdx);
5661 return SubReg;
5662}
5663
5664 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
5665 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
5666 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
5667 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5668 if (Op.isImm()) {
5669 if (SubIdx == AMDGPU::sub0)
5670 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
5671 if (SubIdx == AMDGPU::sub1)
5672 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
5673
5674 llvm_unreachable("Unhandled register index for immediate");
5675 }
5676
5677 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
5678 SubIdx, SubRC);
5679 return MachineOperand::CreateReg(SubReg, false);
5680}
5681
5682// Change the order of operands from (0, 1, 2) to (0, 2, 1)
5683void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
5684 assert(Inst.getNumExplicitOperands() == 3);
5685 MachineOperand Op1 = Inst.getOperand(1);
5686 Inst.removeOperand(1);
5687 Inst.addOperand(Op1);
5688}
5689
5690 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
5691 const MCOperandInfo &OpInfo,
5692 const MachineOperand &MO) const {
5693 if (!MO.isReg())
5694 return false;
5695
5696 Register Reg = MO.getReg();
5697
5698 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
5699 if (Reg.isPhysical())
5700 return DRC->contains(Reg);
5701
5702 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
5703
5704 if (MO.getSubReg()) {
5705 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5706 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
5707 if (!SuperRC)
5708 return false;
5709
5710 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
5711 if (!DRC)
5712 return false;
5713 }
5714 return RC->hasSuperClassEq(DRC);
5715}
5716
5717 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
5718 const MCOperandInfo &OpInfo,
5719 const MachineOperand &MO) const {
5720 if (MO.isReg())
5721 return isLegalRegOperand(MRI, OpInfo, MO);
5722
5723 // Handle non-register types that are treated like immediates.
5724 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
5725 return true;
5726}
5727
5728bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
5729 const MachineOperand *MO) const {
5730 const MachineFunction &MF = *MI.getParent()->getParent();
5731 const MachineRegisterInfo &MRI = MF.getRegInfo();
5732 const MCInstrDesc &InstDesc = MI.getDesc();
5733 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
5734 const TargetRegisterClass *DefinedRC =
5735 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
5736 if (!MO)
5737 MO = &MI.getOperand(OpIdx);
5738
5739 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
5740 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
5741 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
5742 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--)
5743 return false;
5744
5745 SmallDenseSet<RegSubRegPair> SGPRsUsed;
5746 if (MO->isReg())
5747 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
5748
5749 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
5750 if (i == OpIdx)
5751 continue;
5752 const MachineOperand &Op = MI.getOperand(i);
5753 if (Op.isReg()) {
5754 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
5755 if (!SGPRsUsed.count(SGPR) &&
5756 // FIXME: This can access off the end of the operands() array.
5757 usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
5758 if (--ConstantBusLimit <= 0)
5759 return false;
5760 SGPRsUsed.insert(SGPR);
5761 }
5762 } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
5763 !isInlineConstant(Op, InstDesc.operands()[i])) {
5764 if (!LiteralLimit--)
5765 return false;
5766 if (--ConstantBusLimit <= 0)
5767 return false;
5768 }
5769 }
5770 }
5771
5772 if (MO->isReg()) {
5773 if (!DefinedRC)
5774 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
5775 if (!isLegalRegOperand(MRI, OpInfo, *MO))
5776 return false;
5777 bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
5778 if (IsAGPR && !ST.hasMAIInsts())
5779 return false;
5780 unsigned Opc = MI.getOpcode();
5781 if (IsAGPR &&
5782 (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5783 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5784 return false;
5785 // Atomics should have both vdst and vdata either vgpr or agpr.
5786 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5787 const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
5788 isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5789 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5790 MI.getOperand(DataIdx).isReg() &&
5791 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5792 return false;
5793 if ((int)OpIdx == DataIdx) {
5794 if (VDstIdx != -1 &&
5795 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5796 return false;
5797 // DS instructions with 2 src operands also must have tied RC.
5798 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
5799 AMDGPU::OpName::data1);
5800 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5801 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5802 return false;
5803 }
5804 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5805 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5806 RI.isSGPRReg(MRI, MO->getReg()))
5807 return false;
5808 return true;
5809 }
5810
5811 if (MO->isImm()) {
5812 uint64_t Imm = MO->getImm();
5813 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
5814 bool Is64BitOp = Is64BitFPOp ||
5815 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
5816 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
5817 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
5818 if (Is64BitOp &&
5819 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
5820 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
5821 return false;
5822
5823 // FIXME: We can use sign extended 64-bit literals, but only for signed
5824 // operands. At the moment we do not know if an operand is signed.
5825 // Such operand will be encoded as its low 32 bits and then either
5826 // correctly sign extended or incorrectly zero extended by HW.
5827 if (!Is64BitFPOp && (int32_t)Imm < 0)
5828 return false;
5829 }
5830 }
5831
5832 // Handle non-register types that are treated like immediates.
5833 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
5834
5835 if (!DefinedRC) {
5836 // This operand expects an immediate.
5837 return true;
5838 }
5839
5840 return isImmOperandLegal(MI, OpIdx, *MO);
5841}
5842
5843 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
5844 MachineInstr &MI) const {
5845 unsigned Opc = MI.getOpcode();
5846 const MCInstrDesc &InstrDesc = get(Opc);
5847
5848 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5849 MachineOperand &Src0 = MI.getOperand(Src0Idx);
5850
5851 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
5852 MachineOperand &Src1 = MI.getOperand(Src1Idx);
5853
5854 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
5855 // we need to only have one constant bus use before GFX10.
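  // For example (illustrative, not verbatim MIR): in
  //   v_addc_u32_e32 v0, vcc, s0, v1, vcc
  // the implicit VCC read already occupies the single constant-bus slot on
  // targets with a bus limit of 1, so the SGPR src0 must be copied to a VGPR.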
5856 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
5857 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
5858 RI.isSGPRReg(MRI, Src0.getReg()))
5859 legalizeOpWithMove(MI, Src0Idx);
5860
5861 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
5862 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
5863 // src0/src1 with V_READFIRSTLANE.
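  // Sketch of the rewrite (operand names are placeholders):
  //   V_WRITELANE_B32 $vdst, $vgpr_val, $vgpr_lane
  // becomes
  //   $sval  = V_READFIRSTLANE_B32 $vgpr_val
  //   $slane = V_READFIRSTLANE_B32 $vgpr_lane
  //   V_WRITELANE_B32 $vdst, $sval, $slane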
5864 if (Opc == AMDGPU::V_WRITELANE_B32) {
5865 const DebugLoc &DL = MI.getDebugLoc();
5866 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
5867 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5868 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5869 .add(Src0);
5870 Src0.ChangeToRegister(Reg, false);
5871 }
5872 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
5873 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5874 const DebugLoc &DL = MI.getDebugLoc();
5875 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5876 .add(Src1);
5877 Src1.ChangeToRegister(Reg, false);
5878 }
5879 return;
5880 }
5881
5882 // No VOP2 instructions support AGPRs.
5883 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
5884 legalizeOpWithMove(MI, Src0Idx);
5885
5886 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
5887 legalizeOpWithMove(MI, Src1Idx);
5888
5889 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
5890 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
5891 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
5892 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
5893 legalizeOpWithMove(MI, Src2Idx);
5894 }
5895
5896 // VOP2 src0 instructions support all operand types, so we don't need to check
5897 // their legality. If src1 is already legal, we don't need to do anything.
5898 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
5899 return;
5900
5901 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
5902 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
5903 // select is uniform.
5904 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
5905 RI.isVGPR(MRI, Src1.getReg())) {
5906 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5907 const DebugLoc &DL = MI.getDebugLoc();
5908 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5909 .add(Src1);
5910 Src1.ChangeToRegister(Reg, false);
5911 return;
5912 }
5913
5914 // We do not use commuteInstruction here because it is too aggressive and will
5915 // commute if it is possible. We only want to commute here if it improves
5916 // legality. This can be called a fairly large number of times so don't waste
5917 // compile time pointlessly swapping and checking legality again.
5918 if (HasImplicitSGPR || !MI.isCommutable()) {
5919 legalizeOpWithMove(MI, Src1Idx);
5920 return;
5921 }
5922
5923 // If src0 can be used as src1, commuting will make the operands legal.
5924 // Otherwise we have to give up and insert a move.
5925 //
5926 // TODO: Other immediate-like operand kinds could be commuted if there was a
5927 // MachineOperand::ChangeTo* for them.
5928 if ((!Src1.isImm() && !Src1.isReg()) ||
5929 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
5930 legalizeOpWithMove(MI, Src1Idx);
5931 return;
5932 }
5933
5934 int CommutedOpc = commuteOpcode(MI);
5935 if (CommutedOpc == -1) {
5936 legalizeOpWithMove(MI, Src1Idx);
5937 return;
5938 }
5939
5940 MI.setDesc(get(CommutedOpc));
5941
5942 Register Src0Reg = Src0.getReg();
5943 unsigned Src0SubReg = Src0.getSubReg();
5944 bool Src0Kill = Src0.isKill();
5945
5946 if (Src1.isImm())
5947 Src0.ChangeToImmediate(Src1.getImm());
5948 else if (Src1.isReg()) {
5949 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
5950 Src0.setSubReg(Src1.getSubReg());
5951 } else
5952 llvm_unreachable("Should only have register or immediate operands");
5953
5954 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
5955 Src1.setSubReg(Src0SubReg);
5956 fixImplicitOperands(MI);
5957}
5958
5959// Legalize VOP3 operands. All operand types are supported for any operand
5960// but only one literal constant and only starting from GFX10.
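// Illustrative example: with a constant-bus limit of 1 (pre-GFX10), something
// like "v_add3_u32 v0, s0, s1, v1" may keep only one of s0/s1 and the other
// source is copied to a VGPR below; with a limit of 2 (GFX10+) both SGPRs are
// accepted, plus one literal when the subtarget has VOP3 literals.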
5961 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
5962 MachineInstr &MI) const {
5963 unsigned Opc = MI.getOpcode();
5964
5965 int VOP3Idx[3] = {
5966 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
5967 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
5968 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
5969 };
5970
5971 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
5972 Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
5973 // src1 and src2 must be scalar
5974 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
5975 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
5976 const DebugLoc &DL = MI.getDebugLoc();
5977 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
5978 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5979 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5980 .add(Src1);
5981 Src1.ChangeToRegister(Reg, false);
5982 }
5983 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
5984 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5985 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5986 .add(Src2);
5987 Src2.ChangeToRegister(Reg, false);
5988 }
5989 }
5990
5991 // Find the one SGPR operand we are allowed to use.
5992 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
5993 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
5994 SmallDenseSet<unsigned> SGPRsUsed;
5995 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
5996 if (SGPRReg) {
5997 SGPRsUsed.insert(SGPRReg);
5998 --ConstantBusLimit;
5999 }
6000
6001 for (int Idx : VOP3Idx) {
6002 if (Idx == -1)
6003 break;
6004 MachineOperand &MO = MI.getOperand(Idx);
6005
6006 if (!MO.isReg()) {
6007 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6008 continue;
6009
6010 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6011 --LiteralLimit;
6012 --ConstantBusLimit;
6013 continue;
6014 }
6015
6016 --LiteralLimit;
6017 --ConstantBusLimit;
6018 legalizeOpWithMove(MI, Idx);
6019 continue;
6020 }
6021
6022 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6023 !isOperandLegal(MI, Idx, &MO)) {
6024 legalizeOpWithMove(MI, Idx);
6025 continue;
6026 }
6027
6028 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6029 continue; // VGPRs are legal
6030
6031 // We can use one SGPR in each VOP3 instruction prior to GFX10
6032 // and two starting from GFX10.
6033 if (SGPRsUsed.count(MO.getReg()))
6034 continue;
6035 if (ConstantBusLimit > 0) {
6036 SGPRsUsed.insert(MO.getReg());
6037 --ConstantBusLimit;
6038 continue;
6039 }
6040
6041 // If we make it this far, then the operand is not legal and we must
6042 // legalize it.
6043 legalizeOpWithMove(MI, Idx);
6044 }
6045
6046 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6047 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6048 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6049 legalizeOpWithMove(MI, VOP3Idx[2]);
6050}
6051
6052 Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
6053 MachineRegisterInfo &MRI) const {
6054 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6055 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6056 Register DstReg = MRI.createVirtualRegister(SRC);
6057 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6058
6059 if (RI.hasAGPRs(VRC)) {
6060 VRC = RI.getEquivalentVGPRClass(VRC);
6061 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6062 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6063 get(TargetOpcode::COPY), NewSrcReg)
6064 .addReg(SrcReg);
6065 SrcReg = NewSrcReg;
6066 }
6067
6068 if (SubRegs == 1) {
6069 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6070 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6071 .addReg(SrcReg);
6072 return DstReg;
6073 }
6074
6075 SmallVector<Register, 8> SRegs;
6076 for (unsigned i = 0; i < SubRegs; ++i) {
6077 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6078 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6079 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6080 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6081 SRegs.push_back(SGPR);
6082 }
6083
6084 MachineInstrBuilder MIB =
6085 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6086 get(AMDGPU::REG_SEQUENCE), DstReg);
6087 for (unsigned i = 0; i < SubRegs; ++i) {
6088 MIB.addReg(SRegs[i]);
6089 MIB.addImm(RI.getSubRegFromChannel(i));
6090 }
6091 return DstReg;
6092}
6093
6094 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6095 MachineInstr &MI) const {
6096
6097 // If the pointer is stored in VGPRs, then we need to move them to
6098 // SGPRs using v_readfirstlane. This is safe because we only select
6099 // loads with uniform pointers to SMRD instructions, so we know the
6100 // pointer value is uniform.
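  // Sketch: if sbase currently lives in a 64-bit VGPR pair, readlaneVGPRToSGPR
  // emits one V_READFIRSTLANE_B32 per 32-bit piece plus a REG_SEQUENCE, and the
  // SMRD is rewritten to read the resulting SGPR pair instead.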
6101 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6102 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6103 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6104 SBase->setReg(SGPR);
6105 }
6106 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6107 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
6108 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6109 SOff->setReg(SGPR);
6110 }
6111}
6112
6113 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6114 unsigned Opc = Inst.getOpcode();
6115 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6116 if (OldSAddrIdx < 0)
6117 return false;
6118
6119 assert(isSegmentSpecificFLAT(Inst));
6120
6121 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6122 if (NewOpc < 0)
6123 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6124 if (NewOpc < 0)
6125 return false;
6126
6127 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6128 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6129 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6130 return false;
6131
6132 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6133 if (NewVAddrIdx < 0)
6134 return false;
6135
6136 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6137
6138 // Check vaddr, it shall be zero or absent.
6139 MachineInstr *VAddrDef = nullptr;
6140 if (OldVAddrIdx >= 0) {
6141 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6142 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6143 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6144 !VAddrDef->getOperand(1).isImm() ||
6145 VAddrDef->getOperand(1).getImm() != 0)
6146 return false;
6147 }
6148
6149 const MCInstrDesc &NewDesc = get(NewOpc);
6150 Inst.setDesc(NewDesc);
6151
6152 // Callers expect iterator to be valid after this call, so modify the
6153 // instruction in place.
6154 if (OldVAddrIdx == NewVAddrIdx) {
6155 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6156 // Clear use list from the old vaddr holding a zero register.
6157 MRI.removeRegOperandFromUseList(&NewVAddr);
6158 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6159 Inst.removeOperand(OldSAddrIdx);
6160 // Update the use list with the pointer we have just moved from vaddr to
6161 // saddr position. Otherwise new vaddr will be missing from the use list.
6162 MRI.removeRegOperandFromUseList(&NewVAddr);
6163 MRI.addRegOperandToUseList(&NewVAddr);
6164 } else {
6165 assert(OldSAddrIdx == NewVAddrIdx);
6166
6167 if (OldVAddrIdx >= 0) {
6168 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6169 AMDGPU::OpName::vdst_in);
6170
6171 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6172 // it asserts. Untie the operands for now and retie them afterwards.
6173 if (NewVDstIn != -1) {
6174 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6175 Inst.untieRegOperand(OldVDstIn);
6176 }
6177
6178 Inst.removeOperand(OldVAddrIdx);
6179
6180 if (NewVDstIn != -1) {
6181 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6182 Inst.tieOperands(NewVDst, NewVDstIn);
6183 }
6184 }
6185 }
6186
6187 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6188 VAddrDef->eraseFromParent();
6189
6190 return true;
6191}
6192
6193// FIXME: Remove this when SelectionDAG is obsoleted.
6194 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6195 MachineInstr &MI) const {
6196 if (!isSegmentSpecificFLAT(MI))
6197 return;
6198
6199 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6200 // thinks they are uniform, so a readfirstlane should be valid.
6201 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6202 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6203 return;
6204
6205 if (moveFlatAddrToVGPR(MI))
6206 return;
6207
6208 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
6209 SAddr->setReg(ToSGPR);
6210}
6211
6212 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6213 MachineBasicBlock::iterator I,
6214 const TargetRegisterClass *DstRC,
6215 MachineOperand &Op,
6216 MachineRegisterInfo &MRI,
6217 const DebugLoc &DL) const {
6218 Register OpReg = Op.getReg();
6219 unsigned OpSubReg = Op.getSubReg();
6220
6221 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6222 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6223
6224 // Check if operand is already the correct register class.
6225 if (DstRC == OpRC)
6226 return;
6227
6228 Register DstReg = MRI.createVirtualRegister(DstRC);
6229 auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
6230
6231 Op.setReg(DstReg);
6232 Op.setSubReg(0);
6233
6234 MachineInstr *Def = MRI.getVRegDef(OpReg);
6235 if (!Def)
6236 return;
6237
6238 // Try to eliminate the copy if it is copying an immediate value.
6239 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6240 foldImmediate(*Copy, *Def, OpReg, &MRI);
6241
6242 bool ImpDef = Def->isImplicitDef();
6243 while (!ImpDef && Def && Def->isCopy()) {
6244 if (Def->getOperand(1).getReg().isPhysical())
6245 break;
6246 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6247 ImpDef = Def && Def->isImplicitDef();
6248 }
6249 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6250 !ImpDef)
6251 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6252}
6253
6254// Emit the actual waterfall loop, executing the wrapped instruction for each
6255// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6256// iteration, in the worst case we execute 64 (once per lane).
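// Rough shape of the emitted loop (illustration only; the real opcodes are
// picked below based on wave size and operand width):
//   LoopBB:
//     s_val = V_READFIRSTLANE_B32 v_scalar_op      ; per 32-bit piece
//     cond  = V_CMP_EQ_U32/U64 s_val, v_scalar_op  ; AND'ed across ScalarOps
//     saved = S_AND_SAVEEXEC cond
//   BodyBB:
//     <original instruction, now reading s_val>
//     exec  = S_XOR_term exec, saved
//     SI_WATERFALL_LOOP LoopBB                     ; repeat for remaining lanes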
6257 static void emitLoadScalarOpsFromVGPRLoop(
6258 const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB,
6259 MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL,
6260 ArrayRef<MachineOperand *> ScalarOps) {
6261 MachineFunction &MF = *OrigBB.getParent();
6262 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6263 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6264 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6265 unsigned SaveExecOpc =
6266 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6267 unsigned XorTermOpc =
6268 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6269 unsigned AndOpc =
6270 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6271 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6272
6273 MachineBasicBlock::iterator I = LoopBB.begin();
6274
6275 SmallVector<Register, 8> ReadlanePieces;
6276 Register CondReg;
6277
6278 for (MachineOperand *ScalarOp : ScalarOps) {
6279 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6280 unsigned NumSubRegs = RegSize / 32;
6281 Register VScalarOp = ScalarOp->getReg();
6282
6283 if (NumSubRegs == 1) {
6284 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6285
6286 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6287 .addReg(VScalarOp);
6288
6289 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6290
6291 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6292 .addReg(CurReg)
6293 .addReg(VScalarOp);
6294
6295 // Combine the comparison results with AND.
6296 if (!CondReg) // First.
6297 CondReg = NewCondReg;
6298 else { // If not the first, we create an AND.
6299 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6300 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6301 .addReg(CondReg)
6302 .addReg(NewCondReg);
6303 CondReg = AndReg;
6304 }
6305
6306 // Update ScalarOp operand to use the SGPR ScalarOp.
6307 ScalarOp->setReg(CurReg);
6308 ScalarOp->setIsKill();
6309 } else {
6310 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6311 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6312 "Unhandled register size");
6313
6314 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6315 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6316 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6317
6318 // Read the next variant <- also loop target.
6319 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6320 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6321
6322 // Read the next variant <- also loop target.
6323 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6324 .addReg(VScalarOp, VScalarOpUndef,
6325 TRI->getSubRegFromChannel(Idx + 1));
6326
6327 ReadlanePieces.push_back(CurRegLo);
6328 ReadlanePieces.push_back(CurRegHi);
6329
6330 // Comparison is to be done as 64-bit.
6331 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6332 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6333 .addReg(CurRegLo)
6334 .addImm(AMDGPU::sub0)
6335 .addReg(CurRegHi)
6336 .addImm(AMDGPU::sub1);
6337
6338 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6339 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6340 NewCondReg)
6341 .addReg(CurReg);
6342 if (NumSubRegs <= 2)
6343 Cmp.addReg(VScalarOp);
6344 else
6345 Cmp.addReg(VScalarOp, VScalarOpUndef,
6346 TRI->getSubRegFromChannel(Idx, 2));
6347
6348 // Combine the comparison results with AND.
6349 if (!CondReg) // First.
6350 CondReg = NewCondReg;
6351 else { // If not the first, we create an AND.
6352 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6353 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6354 .addReg(CondReg)
6355 .addReg(NewCondReg);
6356 CondReg = AndReg;
6357 }
6358 } // End for loop.
6359
6360 auto SScalarOpRC =
6361 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6362 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6363
6364 // Build scalar ScalarOp.
6365 auto Merge =
6366 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6367 unsigned Channel = 0;
6368 for (Register Piece : ReadlanePieces) {
6369 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6370 }
6371
6372 // Update ScalarOp operand to use the SGPR ScalarOp.
6373 ScalarOp->setReg(SScalarOp);
6374 ScalarOp->setIsKill();
6375 }
6376 }
6377
6378 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6379 MRI.setSimpleHint(SaveExec, CondReg);
6380
6381 // Update EXEC to matching lanes, saving original to SaveExec.
6382 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6383 .addReg(CondReg, RegState::Kill);
6384
6385 // The original instruction is here; we insert the terminators after it.
6386 I = BodyBB.end();
6387
6388 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6389 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6390 .addReg(Exec)
6391 .addReg(SaveExec);
6392
6393 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6394}
6395
6396// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6397// with SGPRs by iterating over all unique values across all lanes.
6398// Returns the loop basic block that now contains \p MI.
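// The resulting control flow looks roughly like this (sketch):
//
//   MBB -> LoopBB -> BodyBB -> RemainderBB
//            ^          |
//            +----------+   (while EXEC still has unprocessed lanes)
//
// SCC and EXEC are saved in MBB and restored at the top of RemainderBB.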
6399static MachineBasicBlock *
6400 loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6401 ArrayRef<MachineOperand *> ScalarOps,
6402 MachineDominatorTree *MDT,
6403 MachineBasicBlock::iterator Begin = nullptr,
6404 MachineBasicBlock::iterator End = nullptr) {
6405 MachineBasicBlock &MBB = *MI.getParent();
6406 MachineFunction &MF = *MBB.getParent();
6407 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6408 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6409 MachineRegisterInfo &MRI = MF.getRegInfo();
6410 if (!Begin.isValid())
6411 Begin = &MI;
6412 if (!End.isValid()) {
6413 End = &MI;
6414 ++End;
6415 }
6416 const DebugLoc &DL = MI.getDebugLoc();
6417 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6418 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6419 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6420
6421 // Save SCC. Waterfall Loop may overwrite SCC.
6422 Register SaveSCCReg;
6423
6424 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
6425 // rather than doing an unlimited scan everywhere.
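  // Sketch of how a live SCC is preserved around the loop:
  //   save:    S_CSELECT_B32 SaveSCCReg, 1, 0   ; SaveSCCReg = SCC ? 1 : 0
  //   restore: S_CMP_LG_U32  SaveSCCReg, 0      ; SCC = (SaveSCCReg != 0)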
6426 bool SCCNotDead =
6427 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
6428 std::numeric_limits<unsigned>::max()) !=
6429 MachineBasicBlock::LQR_Dead;
6430 if (SCCNotDead) {
6431 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6432 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6433 .addImm(1)
6434 .addImm(0);
6435 }
6436
6437 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6438
6439 // Save the EXEC mask
6440 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6441
6442 // Killed uses in the instruction we are waterfalling around will be
6443 // incorrect due to the added control-flow.
6444 MachineBasicBlock::iterator AfterMI = MI;
6445 ++AfterMI;
6446 for (auto I = Begin; I != AfterMI; I++) {
6447 for (auto &MO : I->all_uses())
6448 MRI.clearKillFlags(MO.getReg());
6449 }
6450
6451 // To insert the loop we need to split the block. Move everything after this
6452 // point to a new block, and insert a new empty block between the two.
6453 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
6454 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
6455 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6456 MachineFunction::iterator MBBI(MBB);
6457 ++MBBI;
6458
6459 MF.insert(MBBI, LoopBB);
6460 MF.insert(MBBI, BodyBB);
6461 MF.insert(MBBI, RemainderBB);
6462
6463 LoopBB->addSuccessor(BodyBB);
6464 BodyBB->addSuccessor(LoopBB);
6465 BodyBB->addSuccessor(RemainderBB);
6466
6467 // Move Begin to MI to the BodyBB, and the remainder of the block to
6468 // RemainderBB.
6469 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6470 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6471 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6472
6473 MBB.addSuccessor(LoopBB);
6474
6475 // Update dominators. We know that MBB immediately dominates LoopBB, that
6476 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6477 // RemainderBB. RemainderBB immediately dominates all of the successors
6478 // transferred to it from MBB that MBB used to properly dominate.
6479 if (MDT) {
6480 MDT->addNewBlock(LoopBB, &MBB);
6481 MDT->addNewBlock(BodyBB, LoopBB);
6482 MDT->addNewBlock(RemainderBB, BodyBB);
6483 for (auto &Succ : RemainderBB->successors()) {
6484 if (MDT->properlyDominates(&MBB, Succ)) {
6485 MDT->changeImmediateDominator(Succ, RemainderBB);
6486 }
6487 }
6488 }
6489
6490 emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps);
6491
6492 MachineBasicBlock::iterator First = RemainderBB->begin();
6493 // Restore SCC
6494 if (SCCNotDead) {
6495 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
6496 .addReg(SaveSCCReg, RegState::Kill)
6497 .addImm(0);
6498 }
6499
6500 // Restore the EXEC mask
6501 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
6502 return BodyBB;
6503}
6504
6505// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
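// Schematically, the replacement descriptor built below is (sketch only):
//   NewSRsrc[63:0]   = 0                           ; base pointer cleared
//   NewSRsrc[127:64] = getDefaultRsrcDataFormat()  ; default format bits
// while the extracted 64-bit pointer is returned separately so the caller can
// fold it into VAddr instead.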
6506static std::tuple<unsigned, unsigned>
6507 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
6508 MachineBasicBlock &MBB = *MI.getParent();
6509 MachineFunction &MF = *MBB.getParent();
6511
6512 // Extract the ptr from the resource descriptor.
6513 unsigned RsrcPtr =
6514 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
6515 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
6516
6517 // Create an empty resource descriptor
6518 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6519 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6520 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6521 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6522 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
6523
6524 // Zero64 = 0
6525 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
6526 .addImm(0);
6527
6528 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6529 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
6530 .addImm(RsrcDataFormat & 0xFFFFFFFF);
6531
6532 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6533 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
6534 .addImm(RsrcDataFormat >> 32);
6535
6536 // NewSRsrc = {Zero64, SRsrcFormat}
6537 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
6538 .addReg(Zero64)
6539 .addImm(AMDGPU::sub0_sub1)
6540 .addReg(SRsrcFormatLo)
6541 .addImm(AMDGPU::sub2)
6542 .addReg(SRsrcFormatHi)
6543 .addImm(AMDGPU::sub3);
6544
6545 return std::tuple(RsrcPtr, NewSRsrc);
6546}
6547
6548 MachineBasicBlock *
6549 SIInstrInfo::legalizeOperands(MachineInstr &MI,
6550 MachineDominatorTree *MDT) const {
6551 MachineFunction &MF = *MI.getParent()->getParent();
6552 MachineRegisterInfo &MRI = MF.getRegInfo();
6553 MachineBasicBlock *CreatedBB = nullptr;
6554
6555 // Legalize VOP2
6556 if (isVOP2(MI) || isVOPC(MI)) {
6557 legalizeOperandsVOP2(MRI, MI);
6558 return CreatedBB;
6559 }
6560
6561 // Legalize VOP3
6562 if (isVOP3(MI)) {
6563 legalizeOperandsVOP3(MRI, MI);
6564 return CreatedBB;
6565 }
6566
6567 // Legalize SMRD
6568 if (isSMRD(MI)) {
6569 legalizeOperandsSMRD(MRI, MI);
6570 return CreatedBB;
6571 }
6572
6573 // Legalize FLAT
6574 if (isFLAT(MI)) {
6575 legalizeOperandsFLAT(MRI, MI);
6576 return CreatedBB;
6577 }
6578
6579 // Legalize REG_SEQUENCE and PHI
6580 // The register class of the operands must be the same type as the register
6581 // class of the output.
6582 if (MI.getOpcode() == AMDGPU::PHI) {
6583 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
6584 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
6585 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
6586 continue;
6587 const TargetRegisterClass *OpRC =
6588 MRI.getRegClass(MI.getOperand(i).getReg());
6589 if (RI.hasVectorRegisters(OpRC)) {
6590 VRC = OpRC;
6591 } else {
6592 SRC = OpRC;
6593 }
6594 }
6595
6596 // If any of the operands are VGPR registers, then they all must be;
6597 // otherwise we will create illegal VGPR->SGPR copies when legalizing
6598 // them.
6599 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
6600 if (!VRC) {
6601 assert(SRC);
6602 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
6603 VRC = &AMDGPU::VReg_1RegClass;
6604 } else
6605 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6606 ? RI.getEquivalentAGPRClass(SRC)
6607 : RI.getEquivalentVGPRClass(SRC);
6608 } else {
6609 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6610 ? RI.getEquivalentAGPRClass(VRC)
6611 : RI.getEquivalentVGPRClass(VRC);
6612 }
6613 RC = VRC;
6614 } else {
6615 RC = SRC;
6616 }
6617
6618 // Update all the operands so they have the same type.
6619 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6620 MachineOperand &Op = MI.getOperand(I);
6621 if (!Op.isReg() || !Op.getReg().isVirtual())
6622 continue;
6623
6624 // MI is a PHI instruction.
6625 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
6626 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
6627
6628 // Avoid creating no-op copies with the same src and dst reg class. These
6629 // confuse some of the machine passes.
6630 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
6631 }
6632 }
6633
6634 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6635 // VGPR dest type and SGPR sources, insert copies so all operands are
6636 // VGPRs. This seems to help operand folding / the register coalescer.
6637 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
6638 MachineBasicBlock *MBB = MI.getParent();
6639 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
6640 if (RI.hasVGPRs(DstRC)) {
6641 // Update all the operands so they are VGPR register classes. These may
6642 // not be the same register class because REG_SEQUENCE supports mixing
6643 // subregister index types e.g. sub0_sub1 + sub2 + sub3
6644 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6645 MachineOperand &Op = MI.getOperand(I);
6646 if (!Op.isReg() || !Op.getReg().isVirtual())
6647 continue;
6648
6649 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
6650 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
6651 if (VRC == OpRC)
6652 continue;
6653
6654 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
6655 Op.setIsKill();
6656 }
6657 }
6658
6659 return CreatedBB;
6660 }
6661
6662 // Legalize INSERT_SUBREG
6663 // src0 must have the same register class as dst
6664 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
6665 Register Dst = MI.getOperand(0).getReg();
6666 Register Src0 = MI.getOperand(1).getReg();
6667 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
6668 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
6669 if (DstRC != Src0RC) {
6670 MachineBasicBlock *MBB = MI.getParent();
6671 MachineOperand &Op = MI.getOperand(1);
6672 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
6673 }
6674 return CreatedBB;
6675 }
6676
6677 // Legalize SI_INIT_M0
6678 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
6679 MachineOperand &Src = MI.getOperand(0);
6680 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6681 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6682 return CreatedBB;
6683 }
6684
6685 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6686 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
6687 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
6688 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
6689 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
6690 MI.getOpcode() == AMDGPU::S_WQM_B64) {
6691 MachineOperand &Src = MI.getOperand(1);
6692 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6693 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6694 return CreatedBB;
6695 }
6696
6697 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
6698 //
6699 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
6700 // scratch memory access. In both cases, the legalization never involves
6701 // conversion to the addr64 form.
6702 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
6703 (isMUBUF(MI) || isMTBUF(MI)))) {
6704 int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc
6705 : AMDGPU::OpName::srsrc;
6706 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
6707 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
6708 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
6709
6710 int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
6711 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
6712 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
6713 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
6714
6715 return CreatedBB;
6716 }
6717
6718 // Legalize SI_CALL
6719 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
6720 MachineOperand *Dest = &MI.getOperand(0);
6721 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
6722 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
6723 // following copies, we also need to move copies from and to physical
6724 // registers into the loop block.
6725 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
6726 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
6727
6728 // Also move the copies to physical registers into the loop block
6729 MachineBasicBlock &MBB = *MI.getParent();
6730 MachineBasicBlock::iterator Start(&MI);
6731 while (Start->getOpcode() != FrameSetupOpcode)
6732 --Start;
6733 MachineBasicBlock::iterator End(&MI);
6734 while (End->getOpcode() != FrameDestroyOpcode)
6735 ++End;
6736 // Also include following copies of the return value
6737 ++End;
6738 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
6739 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
6740 ++End;
6741 CreatedBB =
6742 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
6743 }
6744 }
6745
6746 // Legalize s_sleep_var.
6747 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
6748 const DebugLoc &DL = MI.getDebugLoc();
6749 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6750 int Src0Idx =
6751 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
6752 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6753 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6754 .add(Src0);
6755 Src0.ChangeToRegister(Reg, false);
6756 return nullptr;
6757 }
6758
6759 // Legalize MUBUF instructions.
6760 bool isSoffsetLegal = true;
6761 int SoffsetIdx =
6762 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
6763 if (SoffsetIdx != -1) {
6764 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
6765 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
6766 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
6767 isSoffsetLegal = false;
6768 }
6769 }
6770
6771 bool isRsrcLegal = true;
6772 int RsrcIdx =
6773 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
6774 if (RsrcIdx != -1) {
6775 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6776 if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) {
6777 isRsrcLegal = false;
6778 }
6779 }
6780
6781 // The operands are legal.
6782 if (isRsrcLegal && isSoffsetLegal)
6783 return CreatedBB;
6784
6785 if (!isRsrcLegal) {
6786 // Legalize a VGPR Rsrc
6787 //
6788 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
6789 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
6790 // a zero-value SRsrc.
6791 //
6792 // If the instruction is _OFFSET (both idxen and offen disabled), and we
6793 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
6794 // above.
6795 //
6796 // Otherwise we are on non-ADDR64 hardware, and/or we have
6797 // idxen/offen/bothen and we fall back to a waterfall loop.
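      // Summary of the cases handled below (sketch):
      //   _ADDR64 form          -> add the Rsrc base pointer into VAddr and use
      //                            a zero-base SRsrc
      //   _OFFSET and hasAddr64 -> rewrite to the _ADDR64 form, then as above
      //   anything else         -> waterfall loop over Rsrc (and Soffset too,
      //                            if it is also illegal)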
6798
6799 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6800 MachineBasicBlock &MBB = *MI.getParent();
6801
6802 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
6803 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
6804 // This is already an ADDR64 instruction so we need to add the pointer
6805 // extracted from the resource descriptor to the current value of VAddr.
6806 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6807 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6808 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6809
6810 const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6811 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
6812 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
6813
6814 unsigned RsrcPtr, NewSRsrc;
6815 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6816
6817 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
6818 const DebugLoc &DL = MI.getDebugLoc();
6819 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
6820 .addDef(CondReg0)
6821 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6822 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
6823 .addImm(0);
6824
6825 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
6826 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
6827 .addDef(CondReg1, RegState::Dead)
6828 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6829 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
6830 .addReg(CondReg0, RegState::Kill)
6831 .addImm(0);
6832
6833 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6834 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
6835 .addReg(NewVAddrLo)
6836 .addImm(AMDGPU::sub0)
6837 .addReg(NewVAddrHi)
6838 .addImm(AMDGPU::sub1);
6839
6840 VAddr->setReg(NewVAddr);
6841 Rsrc->setReg(NewSRsrc);
6842 } else if (!VAddr && ST.hasAddr64()) {
6843 // This instruction is the _OFFSET variant, so we need to convert it to
6844 // ADDR64.
6845 assert(ST.hasAddr64() &&
6846 "FIXME: Need to emit flat atomics here");
6847
6848 unsigned RsrcPtr, NewSRsrc;
6849 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6850
6851 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6852 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
6853 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
6854 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6855 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
6856
6857 // Atomics with return have an additional tied operand and are
6858 // missing some of the special bits.
6859 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
6860 MachineInstr *Addr64;
6861
6862 if (!VDataIn) {
6863 // Regular buffer load / store.
6864        MachineInstrBuilder MIB =
6865            BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6866 .add(*VData)
6867 .addReg(NewVAddr)
6868 .addReg(NewSRsrc)
6869 .add(*SOffset)
6870 .add(*Offset);
6871
6872 if (const MachineOperand *CPol =
6873 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
6874 MIB.addImm(CPol->getImm());
6875 }
6876
6877 if (const MachineOperand *TFE =
6878 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
6879 MIB.addImm(TFE->getImm());
6880 }
6881
6882 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
6883
6884 MIB.cloneMemRefs(MI);
6885 Addr64 = MIB;
6886 } else {
6887 // Atomics with return.
6888 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6889 .add(*VData)
6890 .add(*VDataIn)
6891 .addReg(NewVAddr)
6892 .addReg(NewSRsrc)
6893 .add(*SOffset)
6894 .add(*Offset)
6895 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
6896 .cloneMemRefs(MI);
6897 }
6898
6899 MI.removeFromParent();
6900
6901      // NewVaddr = {RsrcPtr:sub1, RsrcPtr:sub0} (the base pointer from the Rsrc)
6902 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
6903 NewVAddr)
6904 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6905 .addImm(AMDGPU::sub0)
6906 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6907 .addImm(AMDGPU::sub1);
6908 } else {
6909 // Legalize a VGPR Rsrc and soffset together.
6910 if (!isSoffsetLegal) {
6911 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6912 CreatedBB =
6913 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
6914 return CreatedBB;
6915 }
6916 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
6917 return CreatedBB;
6918 }
6919 }
6920
6921 // Legalize a VGPR soffset.
6922 if (!isSoffsetLegal) {
6923 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6924 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
6925 return CreatedBB;
6926 }
6927 return CreatedBB;
6928}
6929
6930void SIInstrWorklist::insert(MachineInstr *MI) {
6931  InstrList.insert(MI);
6932  // Add MBUF instructions to deferred list.
6933 int RsrcIdx =
6934 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
6935 if (RsrcIdx != -1) {
6936 DeferredList.insert(MI);
6937 }
6938}
6939
6940bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
6941  return DeferredList.contains(MI);
6942}
6943
6944void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
6945                             MachineDominatorTree *MDT) const {
6946
6947 while (!Worklist.empty()) {
6948 MachineInstr &Inst = *Worklist.top();
6949 Worklist.erase_top();
6950 // Skip MachineInstr in the deferred list.
6951 if (Worklist.isDeferred(&Inst))
6952 continue;
6953 moveToVALUImpl(Worklist, MDT, Inst);
6954 }
6955
6956 // Deferred list of instructions will be processed once
6957 // all the MachineInstr in the worklist are done.
6958 for (MachineInstr *Inst : Worklist.getDeferredList()) {
6959 moveToVALUImpl(Worklist, MDT, *Inst);
6960 assert(Worklist.empty() &&
6961 "Deferred MachineInstr are not supposed to re-populate worklist");
6962 }
6963}
6964
6965void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
6966                                 MachineDominatorTree *MDT,
6967                                 MachineInstr &Inst) const {
6968
6969  MachineBasicBlock *MBB = Inst.getParent();
6970  if (!MBB)
6971    return;
6972  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6973  unsigned Opcode = Inst.getOpcode();
6974 unsigned NewOpcode = getVALUOp(Inst);
6975 // Handle some special cases
6976 switch (Opcode) {
6977 default:
6978 break;
6979 case AMDGPU::S_ADD_U64_PSEUDO:
6980 NewOpcode = AMDGPU::V_ADD_U64_PSEUDO;
6981 break;
6982 case AMDGPU::S_SUB_U64_PSEUDO:
6983 NewOpcode = AMDGPU::V_SUB_U64_PSEUDO;
6984 break;
6985 case AMDGPU::S_ADD_I32:
6986 case AMDGPU::S_SUB_I32: {
6987 // FIXME: The u32 versions currently selected use the carry.
6988 bool Changed;
6989 MachineBasicBlock *CreatedBBTmp = nullptr;
6990 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
6991 if (Changed)
6992 return;
6993
6994 // Default handling
6995 break;
6996 }
6997
6998 case AMDGPU::S_MUL_U64:
6999 // Split s_mul_u64 in 32-bit vector multiplications.
7000 splitScalarSMulU64(Worklist, Inst, MDT);
7001 Inst.eraseFromParent();
7002 return;
7003
7004 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7005 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7006 // This is a special case of s_mul_u64 where all the operands are either
7007 // zero extended or sign extended.
7008 splitScalarSMulPseudo(Worklist, Inst, MDT);
7009 Inst.eraseFromParent();
7010 return;
7011
7012 case AMDGPU::S_AND_B64:
7013 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7014 Inst.eraseFromParent();
7015 return;
7016
7017 case AMDGPU::S_OR_B64:
7018 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7019 Inst.eraseFromParent();
7020 return;
7021
7022 case AMDGPU::S_XOR_B64:
7023 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7024 Inst.eraseFromParent();
7025 return;
7026
7027 case AMDGPU::S_NAND_B64:
7028 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7029 Inst.eraseFromParent();
7030 return;
7031
7032 case AMDGPU::S_NOR_B64:
7033 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7034 Inst.eraseFromParent();
7035 return;
7036
7037 case AMDGPU::S_XNOR_B64:
7038 if (ST.hasDLInsts())
7039 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7040 else
7041 splitScalar64BitXnor(Worklist, Inst, MDT);
7042 Inst.eraseFromParent();
7043 return;
7044
7045 case AMDGPU::S_ANDN2_B64:
7046 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7047 Inst.eraseFromParent();
7048 return;
7049
7050 case AMDGPU::S_ORN2_B64:
7051 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7052 Inst.eraseFromParent();
7053 return;
7054
7055 case AMDGPU::S_BREV_B64:
7056 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7057 Inst.eraseFromParent();
7058 return;
7059
7060 case AMDGPU::S_NOT_B64:
7061 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7062 Inst.eraseFromParent();
7063 return;
7064
7065 case AMDGPU::S_BCNT1_I32_B64:
7066 splitScalar64BitBCNT(Worklist, Inst);
7067 Inst.eraseFromParent();
7068 return;
7069
7070 case AMDGPU::S_BFE_I64:
7071 splitScalar64BitBFE(Worklist, Inst);
7072 Inst.eraseFromParent();
7073 return;
7074
7075 case AMDGPU::S_FLBIT_I32_B64:
7076 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7077 Inst.eraseFromParent();
7078 return;
7079 case AMDGPU::S_FF1_I32_B64:
7080 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7081 Inst.eraseFromParent();
7082 return;
7083
7084 case AMDGPU::S_LSHL_B32:
7085 if (ST.hasOnlyRevVALUShifts()) {
7086 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7087 swapOperands(Inst);
7088 }
7089 break;
7090 case AMDGPU::S_ASHR_I32:
7091 if (ST.hasOnlyRevVALUShifts()) {
7092 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7093 swapOperands(Inst);
7094 }
7095 break;
7096 case AMDGPU::S_LSHR_B32:
7097 if (ST.hasOnlyRevVALUShifts()) {
7098 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7099 swapOperands(Inst);
7100 }
7101 break;
7102 case AMDGPU::S_LSHL_B64:
7103 if (ST.hasOnlyRevVALUShifts()) {
7104 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7105 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7106 : AMDGPU::V_LSHLREV_B64_e64;
7107 swapOperands(Inst);
7108 }
7109 break;
7110 case AMDGPU::S_ASHR_I64:
7111 if (ST.hasOnlyRevVALUShifts()) {
7112 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7113 swapOperands(Inst);
7114 }
7115 break;
7116 case AMDGPU::S_LSHR_B64:
7117 if (ST.hasOnlyRevVALUShifts()) {
7118 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7119 swapOperands(Inst);
7120 }
7121 break;
7122
7123 case AMDGPU::S_ABS_I32:
7124 lowerScalarAbs(Worklist, Inst);
7125 Inst.eraseFromParent();
7126 return;
7127
7128 case AMDGPU::S_CBRANCH_SCC0:
7129 case AMDGPU::S_CBRANCH_SCC1: {
7130 // Clear unused bits of vcc
7131 Register CondReg = Inst.getOperand(1).getReg();
7132 bool IsSCC = CondReg == AMDGPU::SCC;
7133 Register VCC = RI.getVCC();
7134 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7135 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7136 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7137 .addReg(EXEC)
7138 .addReg(IsSCC ? VCC : CondReg);
7139 Inst.removeOperand(1);
7140 } break;
7141
7142 case AMDGPU::S_BFE_U64:
7143 case AMDGPU::S_BFM_B64:
7144 llvm_unreachable("Moving this op to VALU not implemented");
7145
7146 case AMDGPU::S_PACK_LL_B32_B16:
7147 case AMDGPU::S_PACK_LH_B32_B16:
7148 case AMDGPU::S_PACK_HL_B32_B16:
7149 case AMDGPU::S_PACK_HH_B32_B16:
7150 movePackToVALU(Worklist, MRI, Inst);
7151 Inst.eraseFromParent();
7152 return;
7153
7154 case AMDGPU::S_XNOR_B32:
7155 lowerScalarXnor(Worklist, Inst);
7156 Inst.eraseFromParent();
7157 return;
7158
7159 case AMDGPU::S_NAND_B32:
7160 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7161 Inst.eraseFromParent();
7162 return;
7163
7164 case AMDGPU::S_NOR_B32:
7165 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7166 Inst.eraseFromParent();
7167 return;
7168
7169 case AMDGPU::S_ANDN2_B32:
7170 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7171 Inst.eraseFromParent();
7172 return;
7173
7174 case AMDGPU::S_ORN2_B32:
7175 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7176 Inst.eraseFromParent();
7177 return;
7178
7179 // TODO: remove as soon as everything is ready
7180 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7181 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7182 // can only be selected from the uniform SDNode.
7183 case AMDGPU::S_ADD_CO_PSEUDO:
7184 case AMDGPU::S_SUB_CO_PSEUDO: {
7185 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7186 ? AMDGPU::V_ADDC_U32_e64
7187 : AMDGPU::V_SUBB_U32_e64;
7188 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7189
7190 Register CarryInReg = Inst.getOperand(4).getReg();
7191 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7192 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7193 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7194 .addReg(CarryInReg);
7195 }
7196
7197 Register CarryOutReg = Inst.getOperand(1).getReg();
7198
7199 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7200 MRI.getRegClass(Inst.getOperand(0).getReg())));
7201 MachineInstr *CarryOp =
7202 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7203 .addReg(CarryOutReg, RegState::Define)
7204 .add(Inst.getOperand(2))
7205 .add(Inst.getOperand(3))
7206 .addReg(CarryInReg)
7207 .addImm(0);
7208 legalizeOperands(*CarryOp);
7209 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7210 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7211 Inst.eraseFromParent();
7212 }
7213 return;
7214 case AMDGPU::S_UADDO_PSEUDO:
7215 case AMDGPU::S_USUBO_PSEUDO: {
7216 const DebugLoc &DL = Inst.getDebugLoc();
7217 MachineOperand &Dest0 = Inst.getOperand(0);
7218 MachineOperand &Dest1 = Inst.getOperand(1);
7219 MachineOperand &Src0 = Inst.getOperand(2);
7220 MachineOperand &Src1 = Inst.getOperand(3);
7221
7222 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7223 ? AMDGPU::V_ADD_CO_U32_e64
7224 : AMDGPU::V_SUB_CO_U32_e64;
7225 const TargetRegisterClass *NewRC =
7226 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7227 Register DestReg = MRI.createVirtualRegister(NewRC);
7228 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7229 .addReg(Dest1.getReg(), RegState::Define)
7230 .add(Src0)
7231 .add(Src1)
7232 .addImm(0); // clamp bit
7233
7234 legalizeOperands(*NewInstr, MDT);
7235 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7236 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7237 Worklist);
7238 Inst.eraseFromParent();
7239 }
7240 return;
7241
7242 case AMDGPU::S_CSELECT_B32:
7243 case AMDGPU::S_CSELECT_B64:
7244 lowerSelect(Worklist, Inst, MDT);
7245 Inst.eraseFromParent();
7246 return;
7247 case AMDGPU::S_CMP_EQ_I32:
7248 case AMDGPU::S_CMP_LG_I32:
7249 case AMDGPU::S_CMP_GT_I32:
7250 case AMDGPU::S_CMP_GE_I32:
7251 case AMDGPU::S_CMP_LT_I32:
7252 case AMDGPU::S_CMP_LE_I32:
7253 case AMDGPU::S_CMP_EQ_U32:
7254 case AMDGPU::S_CMP_LG_U32:
7255 case AMDGPU::S_CMP_GT_U32:
7256 case AMDGPU::S_CMP_GE_U32:
7257 case AMDGPU::S_CMP_LT_U32:
7258 case AMDGPU::S_CMP_LE_U32:
7259 case AMDGPU::S_CMP_EQ_U64:
7260 case AMDGPU::S_CMP_LG_U64:
7261 case AMDGPU::S_CMP_LT_F32:
7262 case AMDGPU::S_CMP_EQ_F32:
7263 case AMDGPU::S_CMP_LE_F32:
7264 case AMDGPU::S_CMP_GT_F32:
7265 case AMDGPU::S_CMP_LG_F32:
7266 case AMDGPU::S_CMP_GE_F32:
7267 case AMDGPU::S_CMP_O_F32:
7268 case AMDGPU::S_CMP_U_F32:
7269 case AMDGPU::S_CMP_NGE_F32:
7270 case AMDGPU::S_CMP_NLG_F32:
7271 case AMDGPU::S_CMP_NGT_F32:
7272 case AMDGPU::S_CMP_NLE_F32:
7273 case AMDGPU::S_CMP_NEQ_F32:
7274 case AMDGPU::S_CMP_NLT_F32:
7275 case AMDGPU::S_CMP_LT_F16:
7276 case AMDGPU::S_CMP_EQ_F16:
7277 case AMDGPU::S_CMP_LE_F16:
7278 case AMDGPU::S_CMP_GT_F16:
7279 case AMDGPU::S_CMP_LG_F16:
7280 case AMDGPU::S_CMP_GE_F16:
7281 case AMDGPU::S_CMP_O_F16:
7282 case AMDGPU::S_CMP_U_F16:
7283 case AMDGPU::S_CMP_NGE_F16:
7284 case AMDGPU::S_CMP_NLG_F16:
7285 case AMDGPU::S_CMP_NGT_F16:
7286 case AMDGPU::S_CMP_NLE_F16:
7287 case AMDGPU::S_CMP_NEQ_F16:
7288 case AMDGPU::S_CMP_NLT_F16: {
7289 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7290 auto NewInstr =
7291 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7292 .setMIFlags(Inst.getFlags());
7293 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7294 AMDGPU::OpName::src0_modifiers) >= 0) {
7295 NewInstr
7296 .addImm(0) // src0_modifiers
7297 .add(Inst.getOperand(0)) // src0
7298 .addImm(0) // src1_modifiers
7299 .add(Inst.getOperand(1)) // src1
7300 .addImm(0); // clamp
7301 } else {
7302 NewInstr
7303 .add(Inst.getOperand(0))
7304 .add(Inst.getOperand(1));
7305 }
7306 legalizeOperands(*NewInstr, MDT);
7307 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7308 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7309 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7310 Inst.eraseFromParent();
7311 return;
7312 }
7313 case AMDGPU::S_CVT_HI_F32_F16: {
7314 const DebugLoc &DL = Inst.getDebugLoc();
7315 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7316 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7317 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7318 .addImm(16)
7319 .add(Inst.getOperand(1));
7320 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7321 .addImm(0) // src0_modifiers
7322 .addReg(TmpReg)
7323 .addImm(0) // clamp
7324 .addImm(0); // omod
7325
7326 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7327 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7328 Inst.eraseFromParent();
7329 return;
7330 }
7331 case AMDGPU::S_MINIMUM_F32:
7332 case AMDGPU::S_MAXIMUM_F32:
7333 case AMDGPU::S_MINIMUM_F16:
7334 case AMDGPU::S_MAXIMUM_F16: {
7335 const DebugLoc &DL = Inst.getDebugLoc();
7336 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7337 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7338 .addImm(0) // src0_modifiers
7339 .add(Inst.getOperand(1))
7340 .addImm(0) // src1_modifiers
7341 .add(Inst.getOperand(2))
7342 .addImm(0) // clamp
7343 .addImm(0); // omod
7344 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7345
7346 legalizeOperands(*NewInstr, MDT);
7347 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7348 Inst.eraseFromParent();
7349 return;
7350 }
7351 }
7352
7353 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
7354 // We cannot move this instruction to the VALU, so we should try to
7355 // legalize its operands instead.
7356 legalizeOperands(Inst, MDT);
7357 return;
7358 }
7359 // Handle converting generic instructions like COPY-to-SGPR into
7360 // COPY-to-VGPR.
7361 if (NewOpcode == Opcode) {
7362 Register DstReg = Inst.getOperand(0).getReg();
7363 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7364
7365 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7366 // hope for the best.
7367 if (Inst.isCopy() && DstReg.isPhysical() &&
7368 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7369 // TODO: Only works for 32 bit registers.
7370 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7371 get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7372 .add(Inst.getOperand(1));
7373 Inst.eraseFromParent();
7374 return;
7375 }
7376
7377 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
7378 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
7379 // Instead of creating a copy where src and dst are the same register
7380 // class, we just replace all uses of dst with src. These kinds of
7381 // copies interfere with the heuristics MachineSink uses to decide
7382      // whether or not to split a critical edge, since the pass assumes
7383      // that copies will end up as machine instructions and not be
7384      // eliminated.
7385 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
7386 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
7387 MRI.clearKillFlags(Inst.getOperand(1).getReg());
7388 Inst.getOperand(0).setReg(DstReg);
7389 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7390 // these are deleted later, but at -O0 it would leave a suspicious
7391 // looking illegal copy of an undef register.
7392 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7393 Inst.removeOperand(I);
7394 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
7395 return;
7396 }
7397 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7398 MRI.replaceRegWith(DstReg, NewDstReg);
7399 legalizeOperands(Inst, MDT);
7400 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7401 return;
7402 }
7403
7404 // Use the new VALU Opcode.
7405 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
7406 .setMIFlags(Inst.getFlags());
7407 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
7408 // Intersperse VOP3 modifiers among the SALU operands.
7409 NewInstr->addOperand(Inst.getOperand(0));
7410 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7411 AMDGPU::OpName::src0_modifiers) >= 0)
7412 NewInstr.addImm(0);
7413 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7414 MachineOperand Src = Inst.getOperand(1);
7415 if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7416 Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7417 NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7418 else
7419 NewInstr->addOperand(Src);
7420 }
7421
7422 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
7423 // We are converting these to a BFE, so we need to add the missing
7424 // operands for the size and offset.
7425 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
7426 NewInstr.addImm(0);
7427 NewInstr.addImm(Size);
7428 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
7429 // The VALU version adds the second operand to the result, so insert an
7430 // extra 0 operand.
7431 NewInstr.addImm(0);
7432 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
7433 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
7434 // If we need to move this to VGPRs, we need to unpack the second
7435 // operand back into the 2 separate ones for bit offset and width.
7436 assert(OffsetWidthOp.isImm() &&
7437 "Scalar BFE is only implemented for constant width and offset");
7438 uint32_t Imm = OffsetWidthOp.getImm();
7439
7440 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
7441 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
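      // For example, an S_BFE immediate encoding offset 8 and width 16 is
      // (16 << 16) | 8 = 0x100008, which unpacks to Offset = 8, BitWidth = 16.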
7442 NewInstr.addImm(Offset);
7443 NewInstr.addImm(BitWidth);
7444 } else {
7445 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7446 AMDGPU::OpName::src1_modifiers) >= 0)
7447 NewInstr.addImm(0);
7448 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
7449 NewInstr->addOperand(Inst.getOperand(2));
7450 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7451 AMDGPU::OpName::src2_modifiers) >= 0)
7452 NewInstr.addImm(0);
7453 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
7454 NewInstr->addOperand(Inst.getOperand(3));
7455 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
7456 NewInstr.addImm(0);
7457 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
7458 NewInstr.addImm(0);
7459 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
7460 NewInstr.addImm(0);
7461 }
7462 } else {
7463 // Just copy the SALU operands.
7464 for (const MachineOperand &Op : Inst.explicit_operands())
7465 NewInstr->addOperand(Op);
7466 }
7467
7468  // Remove any references to SCC. Vector instructions can't read from it, and
7469  // we're just about to add the implicit use / defs of VCC; we don't want
7470  // both.
7471 for (MachineOperand &Op : Inst.implicit_operands()) {
7472 if (Op.getReg() == AMDGPU::SCC) {
7473 // Only propagate through live-def of SCC.
7474 if (Op.isDef() && !Op.isDead())
7475 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
7476 if (Op.isUse())
7477 addSCCDefsToVALUWorklist(NewInstr, Worklist);
7478 }
7479 }
7480 Inst.eraseFromParent();
7481 Register NewDstReg;
7482 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
7483 Register DstReg = NewInstr->getOperand(0).getReg();
7484 assert(DstReg.isVirtual());
7485 // Update the destination register class.
7486 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
7487 assert(NewDstRC);
7488 NewDstReg = MRI.createVirtualRegister(NewDstRC);
7489 MRI.replaceRegWith(DstReg, NewDstReg);
7490 }
7491 fixImplicitOperands(*NewInstr);
7492 // Legalize the operands
7493 legalizeOperands(*NewInstr, MDT);
7494 if (NewDstReg)
7495 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7496}
7497
7498// Add/sub require special handling to deal with carry outs.
7499std::pair<bool, MachineBasicBlock *>
7500SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
7501 MachineDominatorTree *MDT) const {
7502 if (ST.hasAddNoCarry()) {
7503 // Assume there is no user of scc since we don't select this in that case.
7504 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
7505 // is used.
7506
7507 MachineBasicBlock &MBB = *Inst.getParent();
7508    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7509
7510 Register OldDstReg = Inst.getOperand(0).getReg();
7511 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7512
7513 unsigned Opc = Inst.getOpcode();
7514 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
7515
7516 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
7517 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
7518
7519 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
7520 Inst.removeOperand(3);
7521
7522 Inst.setDesc(get(NewOpc));
7523 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
7525 MRI.replaceRegWith(OldDstReg, ResultReg);
7526 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
7527
7528 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7529 return std::pair(true, NewBB);
7530 }
7531
7532 return std::pair(false, nullptr);
7533}
7534
7535void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
7536 MachineDominatorTree *MDT) const {
7537
7538 MachineBasicBlock &MBB = *Inst.getParent();
7539  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7540  MachineBasicBlock::iterator MII = Inst;
7541 DebugLoc DL = Inst.getDebugLoc();
7542
7543 MachineOperand &Dest = Inst.getOperand(0);
7544 MachineOperand &Src0 = Inst.getOperand(1);
7545 MachineOperand &Src1 = Inst.getOperand(2);
7546 MachineOperand &Cond = Inst.getOperand(3);
7547
7548 Register CondReg = Cond.getReg();
7549 bool IsSCC = (CondReg == AMDGPU::SCC);
7550
7551 // If this is a trivial select where the condition is effectively not SCC
7552 // (CondReg is a source of copy to SCC), then the select is semantically
7553 // equivalent to copying CondReg. Hence, there is no need to create
7554 // V_CNDMASK, we can just use that and bail out.
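  // For example, an S_CSELECT selecting between -1 and 0 on a condition that
  // was itself copied into SCC already produces the condition mask, so the
  // select reduces to reusing CondReg directly.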
7555 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
7556 (Src1.getImm() == 0)) {
7557 MRI.replaceRegWith(Dest.getReg(), CondReg);
7558 return;
7559 }
7560
7561 Register NewCondReg = CondReg;
7562 if (IsSCC) {
7563 const TargetRegisterClass *TC =
7564 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7565 NewCondReg = MRI.createVirtualRegister(TC);
7566
7567 // Now look for the closest SCC def if it is a copy
7568 // replacing the CondReg with the COPY source register
7569 bool CopyFound = false;
7570 for (MachineInstr &CandI :
7571         make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
7572                    Inst.getParent()->rend())) {
7573 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
7574 -1) {
7575 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
7576 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
7577 .addReg(CandI.getOperand(1).getReg());
7578 CopyFound = true;
7579 }
7580 break;
7581 }
7582 }
7583 if (!CopyFound) {
7584 // SCC def is not a copy
7585 // Insert a trivial select instead of creating a copy, because a copy from
7586 // SCC would semantically mean just copying a single bit, but we may need
7587 // the result to be a vector condition mask that needs preserving.
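      // For example, S_CSELECT_B64 -1, 0 rebuilds a full wave mask from SCC
      // (all ones when SCC is set, zero otherwise), which V_CNDMASK can then
      // consume as a lane mask.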
7588 unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
7589 : AMDGPU::S_CSELECT_B32;
7590 auto NewSelect =
7591 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
7592 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
7593 }
7594 }
7595
7596 Register NewDestReg = MRI.createVirtualRegister(
7597 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
7598 MachineInstr *NewInst;
7599 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
7600 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
7601 .addImm(0)
7602 .add(Src1) // False
7603 .addImm(0)
7604 .add(Src0) // True
7605 .addReg(NewCondReg);
7606 } else {
7607 NewInst =
7608 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
7609 .add(Src1) // False
7610 .add(Src0) // True
7611 .addReg(NewCondReg);
7612 }
7613 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
7614 legalizeOperands(*NewInst, MDT);
7615 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
7616}
7617
7618void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
7619 MachineInstr &Inst) const {
7620 MachineBasicBlock &MBB = *Inst.getParent();
7621  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7622  MachineBasicBlock::iterator MII = Inst;
7623 DebugLoc DL = Inst.getDebugLoc();
7624
7625 MachineOperand &Dest = Inst.getOperand(0);
7626 MachineOperand &Src = Inst.getOperand(1);
7627 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7628 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7629
7630 unsigned SubOp = ST.hasAddNoCarry() ?
7631 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
7632
7633 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
7634 .addImm(0)
7635 .addReg(Src.getReg());
7636
7637 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
7638 .addReg(Src.getReg())
7639 .addReg(TmpReg);
7640
7641 MRI.replaceRegWith(Dest.getReg(), ResultReg);
7642 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7643}
7644
7645void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
7646 MachineInstr &Inst) const {
7647 MachineBasicBlock &MBB = *Inst.getParent();
7648  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7649  MachineBasicBlock::iterator MII = Inst;
7650 const DebugLoc &DL = Inst.getDebugLoc();
7651
7652 MachineOperand &Dest = Inst.getOperand(0);
7653 MachineOperand &Src0 = Inst.getOperand(1);
7654 MachineOperand &Src1 = Inst.getOperand(2);
7655
7656 if (ST.hasDLInsts()) {
7657 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7658 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
7659 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
7660
7661 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
7662 .add(Src0)
7663 .add(Src1);
7664
7665 MRI.replaceRegWith(Dest.getReg(), NewDest);
7666 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7667 } else {
7668 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
7669 // invert either source and then perform the XOR. If either source is a
7670 // scalar register, then we can leave the inversion on the scalar unit to
7671 // achieve a better distribution of scalar and vector instructions.
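    // For a single bit, e.g. x = 1, y = 0: !(1 ^ 0) = 0 and (!1) ^ 0 = 0, so
    // the inversion can be folded into either operand.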
7672 bool Src0IsSGPR = Src0.isReg() &&
7673 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
7674 bool Src1IsSGPR = Src1.isReg() &&
7675 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
7676    MachineInstr *Xor;
7677    Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7678 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7679
7680 // Build a pair of scalar instructions and add them to the work list.
7681 // The next iteration over the work list will lower these to the vector
7682 // unit as necessary.
7683 if (Src0IsSGPR) {
7684 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
7685 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7686 .addReg(Temp)
7687 .add(Src1);
7688 } else if (Src1IsSGPR) {
7689 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
7690 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7691 .add(Src0)
7692 .addReg(Temp);
7693 } else {
7694 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
7695 .add(Src0)
7696 .add(Src1);
7697 MachineInstr *Not =
7698 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
7699 Worklist.insert(Not);
7700 }
7701
7702 MRI.replaceRegWith(Dest.getReg(), NewDest);
7703
7704 Worklist.insert(Xor);
7705
7706 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7707 }
7708}
7709
7710void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
7711 MachineInstr &Inst,
7712 unsigned Opcode) const {
7713 MachineBasicBlock &MBB = *Inst.getParent();
7714  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7715  MachineBasicBlock::iterator MII = Inst;
7716 const DebugLoc &DL = Inst.getDebugLoc();
7717
7718 MachineOperand &Dest = Inst.getOperand(0);
7719 MachineOperand &Src0 = Inst.getOperand(1);
7720 MachineOperand &Src1 = Inst.getOperand(2);
7721
7722 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7723 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7724
7725 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
7726 .add(Src0)
7727 .add(Src1);
7728
7729 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
7730 .addReg(Interm);
7731
7732 Worklist.insert(&Op);
7733 Worklist.insert(&Not);
7734
7735 MRI.replaceRegWith(Dest.getReg(), NewDest);
7736 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7737}
7738
7739void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
7740 MachineInstr &Inst,
7741 unsigned Opcode) const {
7742 MachineBasicBlock &MBB = *Inst.getParent();
7743  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7744  MachineBasicBlock::iterator MII = Inst;
7745 const DebugLoc &DL = Inst.getDebugLoc();
7746
7747 MachineOperand &Dest = Inst.getOperand(0);
7748 MachineOperand &Src0 = Inst.getOperand(1);
7749 MachineOperand &Src1 = Inst.getOperand(2);
7750
7751 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7752 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7753
7754 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
7755 .add(Src1);
7756
7757 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
7758 .add(Src0)
7759 .addReg(Interm);
7760
7761 Worklist.insert(&Not);
7762 Worklist.insert(&Op);
7763
7764 MRI.replaceRegWith(Dest.getReg(), NewDest);
7765 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7766}
7767
7768void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
7769 MachineInstr &Inst, unsigned Opcode,
7770 bool Swap) const {
7771 MachineBasicBlock &MBB = *Inst.getParent();
7772  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7773
7774 MachineOperand &Dest = Inst.getOperand(0);
7775 MachineOperand &Src0 = Inst.getOperand(1);
7776 DebugLoc DL = Inst.getDebugLoc();
7777
7778 MachineBasicBlock::iterator MII = Inst;
7779
7780 const MCInstrDesc &InstDesc = get(Opcode);
7781 const TargetRegisterClass *Src0RC = Src0.isReg() ?
7782 MRI.getRegClass(Src0.getReg()) :
7783 &AMDGPU::SGPR_32RegClass;
7784
7785 const TargetRegisterClass *Src0SubRC =
7786 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7787
7788 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7789 AMDGPU::sub0, Src0SubRC);
7790
7791 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
7792 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
7793 const TargetRegisterClass *NewDestSubRC =
7794 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
7795
7796 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
7797 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
7798
7799 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7800 AMDGPU::sub1, Src0SubRC);
7801
7802 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
7803 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
7804
7805 if (Swap)
7806 std::swap(DestSub0, DestSub1);
7807
7808 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
7809 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7810 .addReg(DestSub0)
7811 .addImm(AMDGPU::sub0)
7812 .addReg(DestSub1)
7813 .addImm(AMDGPU::sub1);
7814
7815 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7816
7817 Worklist.insert(&LoHalf);
7818 Worklist.insert(&HiHalf);
7819
7820 // We don't need to legalizeOperands here because for a single operand, src0
7821 // will support any kind of input.
7822
7823 // Move all users of this moved value.
7824 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7825}
7826
7827// There is not a vector equivalent of s_mul_u64. For this reason, we need to
7828// split the s_mul_u64 in 32-bit vector multiplications.
7829void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
7830 MachineInstr &Inst,
7831 MachineDominatorTree *MDT) const {
7832 MachineBasicBlock &MBB = *Inst.getParent();
7833  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7834
7835 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7836 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7837 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7838
7839 MachineOperand &Dest = Inst.getOperand(0);
7840 MachineOperand &Src0 = Inst.getOperand(1);
7841 MachineOperand &Src1 = Inst.getOperand(2);
7842 const DebugLoc &DL = Inst.getDebugLoc();
7843 MachineBasicBlock::iterator MII = Inst;
7844
7845 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7846 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7847 const TargetRegisterClass *Src0SubRC =
7848 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7849 if (RI.isSGPRClass(Src0SubRC))
7850 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7851 const TargetRegisterClass *Src1SubRC =
7852 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7853 if (RI.isSGPRClass(Src1SubRC))
7854 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7855
7856 // First, we extract the low 32-bit and high 32-bit values from each of the
7857 // operands.
7858 MachineOperand Op0L =
7859 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7860 MachineOperand Op1L =
7861 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7862 MachineOperand Op0H =
7863 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
7864 MachineOperand Op1H =
7865 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
7866
7867  // The multiplication is done as follows:
7868 //
7869 // Op1H Op1L
7870 // * Op0H Op0L
7871 // --------------------
7872 // Op1H*Op0L Op1L*Op0L
7873 // + Op1H*Op0H Op1L*Op0H
7874 // -----------------------------------------
7875 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
7876 //
7877 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
7878 // value and that would overflow.
7879 // The low 32-bit value is Op1L*Op0L.
7880 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
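  //
  // For example (illustrative), Op0 = 0x0000000100000002 and
  // Op1 = 0x0000000300000004: Op1L*Op0L = 8 (carry 0), and the high half is
  // Op1H*Op0L + Op1L*Op0H + 0 = 3*2 + 4*1 = 10, giving 0x0000000A00000008,
  // which matches the low 64 bits of the full product.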
7881
7882 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7883 MachineInstr *Op1L_Op0H =
7884 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
7885 .add(Op1L)
7886 .add(Op0H);
7887
7888 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7889 MachineInstr *Op1H_Op0L =
7890 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
7891 .add(Op1H)
7892 .add(Op0L);
7893
7894 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7895 MachineInstr *Carry =
7896 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
7897 .add(Op1L)
7898 .add(Op0L);
7899
7900 MachineInstr *LoHalf =
7901 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7902 .add(Op1L)
7903 .add(Op0L);
7904
7905 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7906 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
7907 .addReg(Op1L_Op0H_Reg)
7908 .addReg(Op1H_Op0L_Reg);
7909
7910 MachineInstr *HiHalf =
7911 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
7912 .addReg(AddReg)
7913 .addReg(CarryReg);
7914
7915 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7916 .addReg(DestSub0)
7917 .addImm(AMDGPU::sub0)
7918 .addReg(DestSub1)
7919 .addImm(AMDGPU::sub1);
7920
7921 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7922
7923 // Try to legalize the operands in case we need to swap the order to keep it
7924 // valid.
7925 legalizeOperands(*Op1L_Op0H, MDT);
7926 legalizeOperands(*Op1H_Op0L, MDT);
7927 legalizeOperands(*Carry, MDT);
7928 legalizeOperands(*LoHalf, MDT);
7929 legalizeOperands(*Add, MDT);
7930 legalizeOperands(*HiHalf, MDT);
7931
7932 // Move all users of this moved value.
7933 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7934}
7935
7936// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
7937// multiplications.
7938void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
7939 MachineInstr &Inst,
7940 MachineDominatorTree *MDT) const {
7941 MachineBasicBlock &MBB = *Inst.getParent();
7942  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7943
7944 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7945 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7946 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7947
7948 MachineOperand &Dest = Inst.getOperand(0);
7949 MachineOperand &Src0 = Inst.getOperand(1);
7950 MachineOperand &Src1 = Inst.getOperand(2);
7951 const DebugLoc &DL = Inst.getDebugLoc();
7952 MachineBasicBlock::iterator MII = Inst;
7953
7954 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7955 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7956 const TargetRegisterClass *Src0SubRC =
7957 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7958 if (RI.isSGPRClass(Src0SubRC))
7959 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7960 const TargetRegisterClass *Src1SubRC =
7961 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7962 if (RI.isSGPRClass(Src1SubRC))
7963 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7964
7965 // First, we extract the low 32-bit and high 32-bit values from each of the
7966 // operands.
7967 MachineOperand Op0L =
7968 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7969 MachineOperand Op1L =
7970 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7971
7972 unsigned Opc = Inst.getOpcode();
7973 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
7974 ? AMDGPU::V_MUL_HI_U32_e64
7975 : AMDGPU::V_MUL_HI_I32_e64;
7976 MachineInstr *HiHalf =
7977 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
7978
7979 MachineInstr *LoHalf =
7980 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7981 .add(Op1L)
7982 .add(Op0L);
7983
7984 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7985 .addReg(DestSub0)
7986 .addImm(AMDGPU::sub0)
7987 .addReg(DestSub1)
7988 .addImm(AMDGPU::sub1);
7989
7990 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7991
7992 // Try to legalize the operands in case we need to swap the order to keep it
7993 // valid.
7994 legalizeOperands(*HiHalf, MDT);
7995 legalizeOperands(*LoHalf, MDT);
7996
7997 // Move all users of this moved value.
7998 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7999}
8000
8001void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8002 MachineInstr &Inst, unsigned Opcode,
8003 MachineDominatorTree *MDT) const {
8004 MachineBasicBlock &MBB = *Inst.getParent();
8005  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8006
8007 MachineOperand &Dest = Inst.getOperand(0);
8008 MachineOperand &Src0 = Inst.getOperand(1);
8009 MachineOperand &Src1 = Inst.getOperand(2);
8010 DebugLoc DL = Inst.getDebugLoc();
8011
8012 MachineBasicBlock::iterator MII = Inst;
8013
8014 const MCInstrDesc &InstDesc = get(Opcode);
8015 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8016 MRI.getRegClass(Src0.getReg()) :
8017 &AMDGPU::SGPR_32RegClass;
8018
8019 const TargetRegisterClass *Src0SubRC =
8020 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8021 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8022 MRI.getRegClass(Src1.getReg()) :
8023 &AMDGPU::SGPR_32RegClass;
8024
8025 const TargetRegisterClass *Src1SubRC =
8026 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8027
8028 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8029 AMDGPU::sub0, Src0SubRC);
8030 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8031 AMDGPU::sub0, Src1SubRC);
8032 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8033 AMDGPU::sub1, Src0SubRC);
8034 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8035 AMDGPU::sub1, Src1SubRC);
8036
8037 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8038 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8039 const TargetRegisterClass *NewDestSubRC =
8040 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8041
8042 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8043 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8044 .add(SrcReg0Sub0)
8045 .add(SrcReg1Sub0);
8046
8047 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8048 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8049 .add(SrcReg0Sub1)
8050 .add(SrcReg1Sub1);
8051
8052 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8053 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8054 .addReg(DestSub0)
8055 .addImm(AMDGPU::sub0)
8056 .addReg(DestSub1)
8057 .addImm(AMDGPU::sub1);
8058
8059 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8060
8061 Worklist.insert(&LoHalf);
8062 Worklist.insert(&HiHalf);
8063
8064 // Move all users of this moved value.
8065 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8066}
8067
8068void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8069 MachineInstr &Inst,
8070 MachineDominatorTree *MDT) const {
8071 MachineBasicBlock &MBB = *Inst.getParent();
8072  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8073
8074 MachineOperand &Dest = Inst.getOperand(0);
8075 MachineOperand &Src0 = Inst.getOperand(1);
8076 MachineOperand &Src1 = Inst.getOperand(2);
8077 const DebugLoc &DL = Inst.getDebugLoc();
8078
8079 MachineBasicBlock::iterator MII = Inst;
8080
8081 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8082
8083 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8084
8085 MachineOperand* Op0;
8086 MachineOperand* Op1;
8087
8088 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8089 Op0 = &Src0;
8090 Op1 = &Src1;
8091 } else {
8092 Op0 = &Src1;
8093 Op1 = &Src0;
8094 }
8095
8096 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8097 .add(*Op0);
8098
8099 Register NewDest = MRI.createVirtualRegister(DestRC);
8100
8101 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8102 .addReg(Interm)
8103 .add(*Op1);
8104
8105 MRI.replaceRegWith(Dest.getReg(), NewDest);
8106
8107 Worklist.insert(&Xor);
8108}
8109
8110void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8111 MachineInstr &Inst) const {
8112 MachineBasicBlock &MBB = *Inst.getParent();
8113  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8114
8115 MachineBasicBlock::iterator MII = Inst;
8116 const DebugLoc &DL = Inst.getDebugLoc();
8117
8118 MachineOperand &Dest = Inst.getOperand(0);
8119 MachineOperand &Src = Inst.getOperand(1);
8120
8121 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8122 const TargetRegisterClass *SrcRC = Src.isReg() ?
8123 MRI.getRegClass(Src.getReg()) :
8124 &AMDGPU::SGPR_32RegClass;
8125
8126 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8127 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8128
8129 const TargetRegisterClass *SrcSubRC =
8130 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8131
8132 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8133 AMDGPU::sub0, SrcSubRC);
8134 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8135 AMDGPU::sub1, SrcSubRC);
8136
8137 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8138
8139 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8140
8141 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8142
8143 // We don't need to legalize operands here. src0 for either instruction can be
8144 // an SGPR, and the second input is unused or determined here.
8145 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8146}
8147
8148void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8149 MachineInstr &Inst) const {
8150 MachineBasicBlock &MBB = *Inst.getParent();
8151  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8152  MachineBasicBlock::iterator MII = Inst;
8153 const DebugLoc &DL = Inst.getDebugLoc();
8154
8155 MachineOperand &Dest = Inst.getOperand(0);
8156 uint32_t Imm = Inst.getOperand(2).getImm();
8157 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8158 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8159
8160 (void) Offset;
8161
8162 // Only sext_inreg cases handled.
8163 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8164 Offset == 0 && "Not implemented");
8165
8166 if (BitWidth < 32) {
8167 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8168 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8169 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8170
8171 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8172 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8173 .addImm(0)
8174 .addImm(BitWidth);
8175
8176 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8177 .addImm(31)
8178 .addReg(MidRegLo);
8179
8180 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8181 .addReg(MidRegLo)
8182 .addImm(AMDGPU::sub0)
8183 .addReg(MidRegHi)
8184 .addImm(AMDGPU::sub1);
8185
8186 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8187 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8188 return;
8189 }
8190
8191 MachineOperand &Src = Inst.getOperand(1);
8192 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8193 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8194
8195 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8196 .addImm(31)
8197 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8198
8199 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8200 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8201 .addImm(AMDGPU::sub0)
8202 .addReg(TmpReg)
8203 .addImm(AMDGPU::sub1);
8204
8205 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8206 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8207}
8208
8209void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8210 MachineInstr &Inst, unsigned Opcode,
8211 MachineDominatorTree *MDT) const {
8212 // (S_FLBIT_I32_B64 hi:lo) ->
8213 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8214 // (S_FF1_I32_B64 hi:lo) ->
8215 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
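  //
  // For example (the ff1/ctz case): hi = 0x8, lo = 0. V_FFBL_B32(lo) returns
  // 0xffffffff, V_FFBL_B32(hi) = 3, uaddsat(3, 32) = 35, and the umin picks
  // 35, the index of the first set bit of the 64-bit value.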
8216
8217 MachineBasicBlock &MBB = *Inst.getParent();
8218  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8219  MachineBasicBlock::iterator MII = Inst;
8220 const DebugLoc &DL = Inst.getDebugLoc();
8221
8222 MachineOperand &Dest = Inst.getOperand(0);
8223 MachineOperand &Src = Inst.getOperand(1);
8224
8225 const MCInstrDesc &InstDesc = get(Opcode);
8226
8227 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8228 unsigned OpcodeAdd =
8229 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8230
8231 const TargetRegisterClass *SrcRC =
8232 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8233 const TargetRegisterClass *SrcSubRC =
8234 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8235
8236 MachineOperand SrcRegSub0 =
8237 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8238 MachineOperand SrcRegSub1 =
8239 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8240
8241 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8242 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8243 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8244 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8245
8246 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8247
8248 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8249
8250 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8251 .addReg(IsCtlz ? MidReg1 : MidReg2)
8252 .addImm(32)
8253 .addImm(1); // enable clamp
8254
8255 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8256 .addReg(MidReg3)
8257 .addReg(IsCtlz ? MidReg2 : MidReg1);
8258
8259 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8260
8261 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8262}
8263
8264void SIInstrInfo::addUsersToMoveToVALUWorklist(
8265    Register DstReg, MachineRegisterInfo &MRI,
8266    SIInstrWorklist &Worklist) const {
8267 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
8268 E = MRI.use_end(); I != E;) {
8269 MachineInstr &UseMI = *I->getParent();
8270
8271 unsigned OpNo = 0;
8272
8273 switch (UseMI.getOpcode()) {
8274 case AMDGPU::COPY:
8275 case AMDGPU::WQM:
8276 case AMDGPU::SOFT_WQM:
8277 case AMDGPU::STRICT_WWM:
8278 case AMDGPU::STRICT_WQM:
8279 case AMDGPU::REG_SEQUENCE:
8280 case AMDGPU::PHI:
8281 case AMDGPU::INSERT_SUBREG:
8282 break;
8283 default:
8284 OpNo = I.getOperandNo();
8285 break;
8286 }
8287
8288 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
8289 Worklist.insert(&UseMI);
8290
8291 do {
8292 ++I;
8293 } while (I != E && I->getParent() == &UseMI);
8294 } else {
8295 ++I;
8296 }
8297 }
8298}
8299
8300void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
8301                                 MachineRegisterInfo &MRI,
8302                                 MachineInstr &Inst) const {
8303 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8304  MachineBasicBlock *MBB = Inst.getParent();
8305  MachineOperand &Src0 = Inst.getOperand(1);
8306 MachineOperand &Src1 = Inst.getOperand(2);
8307 const DebugLoc &DL = Inst.getDebugLoc();
8308
8309 switch (Inst.getOpcode()) {
8310 case AMDGPU::S_PACK_LL_B32_B16: {
8311 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8312 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8313
8314 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8315 // 0.
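    // The packed result is (Src1 << 16) | (Src0 & 0xffff); the V_AND keeps the
    // low half of Src0 and V_LSHL_OR merges in Src1's low 16 bits shifted up.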
8316 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8317 .addImm(0xffff);
8318
8319 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
8320 .addReg(ImmReg, RegState::Kill)
8321 .add(Src0);
8322
8323 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8324 .add(Src1)
8325 .addImm(16)
8326 .addReg(TmpReg, RegState::Kill);
8327 break;
8328 }
8329 case AMDGPU::S_PACK_LH_B32_B16: {
8330 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8331 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8332 .addImm(0xffff);
8333 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
8334 .addReg(ImmReg, RegState::Kill)
8335 .add(Src0)
8336 .add(Src1);
8337 break;
8338 }
8339 case AMDGPU::S_PACK_HL_B32_B16: {
8340 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8341 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8342 .addImm(16)
8343 .add(Src0);
8344 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8345 .add(Src1)
8346 .addImm(16)
8347 .addReg(TmpReg, RegState::Kill);
8348 break;
8349 }
8350 case AMDGPU::S_PACK_HH_B32_B16: {
8351 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8352 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8353 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8354 .addImm(16)
8355 .add(Src0);
8356 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8357 .addImm(0xffff0000);
8358 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
8359 .add(Src1)
8360 .addReg(ImmReg, RegState::Kill)
8361 .addReg(TmpReg, RegState::Kill);
8362 break;
8363 }
8364 default:
8365 llvm_unreachable("unhandled s_pack_* instruction");
8366 }
8367
8368 MachineOperand &Dest = Inst.getOperand(0);
8369 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8370 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8371}
8372
8373void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
8374 MachineInstr &SCCDefInst,
8375 SIInstrWorklist &Worklist,
8376 Register NewCond) const {
8377
8378 // Ensure that def inst defines SCC, which is still live.
8379 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
8380 !Op.isDead() && Op.getParent() == &SCCDefInst);
8381 SmallVector<MachineInstr *, 4> CopyToDelete;
8382 // This assumes that all the users of SCC are in the same block
8383 // as the SCC def.
8384 for (MachineInstr &MI : // Skip the def inst itself.
8385 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
8386 SCCDefInst.getParent()->end())) {
8387 // Check if SCC is used first.
8388 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
8389 if (SCCIdx != -1) {
8390 if (MI.isCopy()) {
8391 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8392 Register DestReg = MI.getOperand(0).getReg();
8393
8394 MRI.replaceRegWith(DestReg, NewCond);
8395 CopyToDelete.push_back(&MI);
8396 } else {
8397
8398 if (NewCond.isValid())
8399 MI.getOperand(SCCIdx).setReg(NewCond);
8400
8401 Worklist.insert(&MI);
8402 }
8403 }
8404 // Exit if we find another SCC def.
8405 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
8406 break;
8407 }
8408 for (auto &Copy : CopyToDelete)
8409 Copy->eraseFromParent();
8410}
8411
8412// Instructions that use SCC may be converted to VALU instructions. When that
8413// happens, the SCC register is changed to VCC_LO. The instruction that defines
8414// SCC must be changed to an instruction that defines VCC. This function makes
8415// sure that the instruction that defines SCC is added to the moveToVALU
8416// worklist.
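// For example, when an S_CSELECT that read SCC has been rewritten to
// V_CNDMASK reading a lane mask, the S_CMP that defined SCC must also move to
// the VALU (becoming a V_CMP that defines VCC); this walk finds that defining
// instruction and queues it.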
8417void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
8418 SIInstrWorklist &Worklist) const {
8419 // Look for a preceding instruction that either defines VCC or SCC. If VCC
8420 // then there is nothing to do because the defining instruction has been
8421 // converted to a VALU already. If SCC then that instruction needs to be
8422 // converted to a VALU.
8423 for (MachineInstr &MI :
8424 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
8425 SCCUseInst->getParent()->rend())) {
8426 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
8427 break;
8428 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
8429 Worklist.insert(&MI);
8430 break;
8431 }
8432 }
8433}
8434
8435const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
8436 const MachineInstr &Inst) const {
8437 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
8438
8439 switch (Inst.getOpcode()) {
8440 // For target instructions, getOpRegClass just returns the virtual register
8441 // class associated with the operand, so we need to find an equivalent VGPR
8442 // register class in order to move the instruction to the VALU.
8443 case AMDGPU::COPY:
8444 case AMDGPU::PHI:
8445 case AMDGPU::REG_SEQUENCE:
8446 case AMDGPU::INSERT_SUBREG:
8447 case AMDGPU::WQM:
8448 case AMDGPU::SOFT_WQM:
8449 case AMDGPU::STRICT_WWM:
8450 case AMDGPU::STRICT_WQM: {
8451 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
8452 if (RI.isAGPRClass(SrcRC)) {
8453 if (RI.isAGPRClass(NewDstRC))
8454 return nullptr;
8455
8456 switch (Inst.getOpcode()) {
8457 case AMDGPU::PHI:
8458 case AMDGPU::REG_SEQUENCE:
8459 case AMDGPU::INSERT_SUBREG:
8460 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
8461 break;
8462 default:
8463 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8464 }
8465
8466 if (!NewDstRC)
8467 return nullptr;
8468 } else {
8469 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
8470 return nullptr;
8471
8472 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8473 if (!NewDstRC)
8474 return nullptr;
8475 }
8476
8477 return NewDstRC;
8478 }
8479 default:
8480 return NewDstRC;
8481 }
8482}
8483
8484// Find the one SGPR operand we are allowed to use.
8485Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
8486 int OpIndices[3]) const {
8487 const MCInstrDesc &Desc = MI.getDesc();
8488
8489 // Find the one SGPR operand we are allowed to use.
8490 //
8491 // First we need to consider the instruction's operand requirements before
8492 // legalizing. Some operands are required to be SGPRs, such as implicit uses
8493 // of VCC, but we are still bound by the constant bus requirement to only use
8494 // one.
8495 //
8496 // If the operand's class is an SGPR, we can never move it.
8497
8498 Register SGPRReg = findImplicitSGPRRead(MI);
8499 if (SGPRReg)
8500 return SGPRReg;
8501
8502 Register UsedSGPRs[3] = {Register()};
8503 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8504
8505 for (unsigned i = 0; i < 3; ++i) {
8506 int Idx = OpIndices[i];
8507 if (Idx == -1)
8508 break;
8509
8510 const MachineOperand &MO = MI.getOperand(Idx);
8511 if (!MO.isReg())
8512 continue;
8513
8514 // Is this operand statically required to be an SGPR based on the operand
8515 // constraints?
8516 const TargetRegisterClass *OpRC =
8517 RI.getRegClass(Desc.operands()[Idx].RegClass);
8518 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
8519 if (IsRequiredSGPR)
8520 return MO.getReg();
8521
8522 // If this could be a VGPR or an SGPR, check the dynamic register class.
8523 Register Reg = MO.getReg();
8524 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
8525 if (RI.isSGPRClass(RegRC))
8526 UsedSGPRs[i] = Reg;
8527 }
8528
8529 // We don't have a required SGPR operand, so we have a bit more freedom in
8530 // selecting operands to move.
8531
8532 // Try to select the most used SGPR. If an SGPR is equal to one of the
8533 // others, we choose that.
8534 //
8535 // e.g.
8536 // V_FMA_F32 v0, s0, s0, s0 -> No moves
8537 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
8538
8539 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
8540 // prefer those.
8541
8542 if (UsedSGPRs[0]) {
8543 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
8544 SGPRReg = UsedSGPRs[0];
8545 }
8546
8547 if (!SGPRReg && UsedSGPRs[1]) {
8548 if (UsedSGPRs[1] == UsedSGPRs[2])
8549 SGPRReg = UsedSGPRs[1];
8550 }
8551
8552 return SGPRReg;
8553}
8554
8556 unsigned OperandName) const {
8557 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
8558 if (Idx == -1)
8559 return nullptr;
8560
8561 return &MI.getOperand(Idx);
8562}
8563
8569 return (Format << 44) |
8570 (1ULL << 56) | // RESOURCE_LEVEL = 1
8571 (3ULL << 60); // OOB_SELECT = 3
8572 }
8573
8574 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
8575 if (ST.isAmdHsaOS()) {
8576 // Set ATC = 1. GFX9 doesn't have this bit.
8578 RsrcDataFormat |= (1ULL << 56);
8579
8580 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
8581 // Note that it disables TC L2 caching and therefore decreases performance.
8583 RsrcDataFormat |= (2ULL << 59);
8584 }
8585
8586 return RsrcDataFormat;
8587}
8588
8592 0xffffffff; // Size;
8593
8594 // GFX9 doesn't have ELEMENT_SIZE.
8596 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
8597 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
8598 }
8599
8600 // IndexStride = 64 or 32, encoded below as 3 or 2 respectively.
8601 uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
8602 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
8603
8604 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
8605 // Clear them unless we want a huge stride.
8608 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
8609
8610 return Rsrc23;
8611}
8612
8614 unsigned Opc = MI.getOpcode();
8615
8616 return isSMRD(Opc);
8617}
8618
8620 return get(Opc).mayLoad() &&
8621 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
8622}
8623
8625 int &FrameIndex) const {
8626 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
8627 if (!Addr || !Addr->isFI())
8628 return Register();
8629
8630 assert(!MI.memoperands_empty() &&
8631 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
8632
8633 FrameIndex = Addr->getIndex();
8634 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
8635}
8636
8638 int &FrameIndex) const {
8639 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
8640 assert(Addr && Addr->isFI());
8641 FrameIndex = Addr->getIndex();
8642 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
8643}
8644
8646 int &FrameIndex) const {
8647 if (!MI.mayLoad())
8648 return Register();
8649
8650 if (isMUBUF(MI) || isVGPRSpill(MI))
8651 return isStackAccess(MI, FrameIndex);
8652
8653 if (isSGPRSpill(MI))
8654 return isSGPRStackAccess(MI, FrameIndex);
8655
8656 return Register();
8657}
8658
8660 int &FrameIndex) const {
8661 if (!MI.mayStore())
8662 return Register();
8663
8664 if (isMUBUF(MI) || isVGPRSpill(MI))
8665 return isStackAccess(MI, FrameIndex);
8666
8667 if (isSGPRSpill(MI))
8668 return isSGPRStackAccess(MI, FrameIndex);
8669
8670 return Register();
8671}
8672
8674 unsigned Size = 0;
8676 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
8677 while (++I != E && I->isInsideBundle()) {
8678 assert(!I->isBundle() && "No nested bundle!");
8680 }
8681
8682 return Size;
8683}
8684
8686 unsigned Opc = MI.getOpcode();
8688 unsigned DescSize = Desc.getSize();
8689
8690 // If we have a definitive size, we can use it. Otherwise we need to inspect
8691 // the operands to know the size.
8692 if (isFixedSize(MI)) {
8693 unsigned Size = DescSize;
8694
8695 // If we hit the buggy offset, an extra nop will be inserted in MC so
8696 // estimate the worst case.
8697 if (MI.isBranch() && ST.hasOffset3fBug())
8698 Size += 4;
8699
8700 return Size;
8701 }
8702
8703 // Instructions may have a 32-bit literal encoded after them. Check
8704 // operands that could ever be literals.
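  // For instance, an S_MOV_B32 with the non-inline literal 0x12345678 takes the
  // 4-byte base encoding plus a 4-byte literal (8 bytes total), while the same
  // move with an inline constant such as 1 stays at DescSize.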
8705 if (isVALU(MI) || isSALU(MI)) {
8706 if (isDPP(MI))
8707 return DescSize;
8708 bool HasLiteral = false;
8709 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
8710 const MachineOperand &Op = MI.getOperand(I);
8711 const MCOperandInfo &OpInfo = Desc.operands()[I];
8712 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
8713 HasLiteral = true;
8714 break;
8715 }
8716 }
8717 return HasLiteral ? DescSize + 4 : DescSize;
8718 }
8719
8720 // Check whether we have extra NSA words.
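  // The base MIMG encoding is 8 bytes; NSA packs up to four additional address
  // registers per extra dword. E.g. five vaddr operands need one NSA dword,
  // giving 12 bytes by the formula below (illustrative count).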
8721 if (isMIMG(MI)) {
8722 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
8723 if (VAddr0Idx < 0)
8724 return 8;
8725
8726 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
8727 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
8728 }
8729
8730 switch (Opc) {
8731 case TargetOpcode::BUNDLE:
8732 return getInstBundleSize(MI);
8733 case TargetOpcode::INLINEASM:
8734 case TargetOpcode::INLINEASM_BR: {
8735 const MachineFunction *MF = MI.getParent()->getParent();
8736 const char *AsmStr = MI.getOperand(0).getSymbolName();
8737 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
8738 }
8739 default:
8740 if (MI.isMetaInstruction())
8741 return 0;
8742 return DescSize;
8743 }
8744}
8745
8747 if (!isFLAT(MI))
8748 return false;
8749
8750 if (MI.memoperands_empty())
8751 return true;
8752
8753 for (const MachineMemOperand *MMO : MI.memoperands()) {
8754 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
8755 return true;
8756 }
8757 return false;
8758}
8759
8761 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
8762}
8763
8765 MachineBasicBlock *IfEnd) const {
8767 assert(TI != IfEntry->end());
8768
8769 MachineInstr *Branch = &(*TI);
8770 MachineFunction *MF = IfEntry->getParent();
8772
8773 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8774 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8775 MachineInstr *SIIF =
8776 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
8777 .add(Branch->getOperand(0))
8778 .add(Branch->getOperand(1));
8779 MachineInstr *SIEND =
8780 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
8781 .addReg(DstReg);
8782
8783 IfEntry->erase(TI);
8784 IfEntry->insert(IfEntry->end(), SIIF);
8785 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
8786 }
8787}
8788
8790 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
8792 // We expect 2 terminators, one conditional and one unconditional.
8793 assert(TI != LoopEnd->end());
8794
8795 MachineInstr *Branch = &(*TI);
8796 MachineFunction *MF = LoopEnd->getParent();
8798
8799 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8800
8801 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8802 Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
8803 MachineInstrBuilder HeaderPHIBuilder =
8804 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
8805 for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) {
8806 if (PMBB == LoopEnd) {
8807 HeaderPHIBuilder.addReg(BackEdgeReg);
8808 } else {
8809 Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
8810 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
8811 ZeroReg, 0);
8812 HeaderPHIBuilder.addReg(ZeroReg);
8813 }
8814 HeaderPHIBuilder.addMBB(PMBB);
8815 }
8816 MachineInstr *HeaderPhi = HeaderPHIBuilder;
8817 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
8818 get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
8819 .addReg(DstReg)
8820 .add(Branch->getOperand(0));
8821 MachineInstr *SILOOP =
8822 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
8823 .addReg(BackEdgeReg)
8824 .addMBB(LoopEntry);
8825
8826 LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
8827 LoopEnd->erase(TI);
8828 LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
8829 LoopEnd->insert(LoopEnd->end(), SILOOP);
8830 }
8831}
8832
8835 static const std::pair<int, const char *> TargetIndices[] = {
8836 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
8837 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
8838 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
8839 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
8840 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
8841 return ArrayRef(TargetIndices);
8842}
8843
8844/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
8845/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
8848 const ScheduleDAG *DAG) const {
8849 return new GCNHazardRecognizer(DAG->MF);
8850}
8851
8852/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
8853/// pass.
8856 return new GCNHazardRecognizer(MF);
8857}
8858
8859// Called during:
8860// - pre-RA scheduling and post-RA scheduling
8863 const ScheduleDAGMI *DAG) const {
8864 // Borrowed from Arm Target
8865 // We would like to restrict this hazard recognizer to only
8866 // post-RA scheduling; we can tell that we're post-RA because we don't
8867 // track VRegLiveness.
8868 if (!DAG->hasVRegLiveness())
8869 return new GCNHazardRecognizer(DAG->MF);
8871}
8872
8873std::pair<unsigned, unsigned>
8875 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
8876}
8877
8880 static const std::pair<unsigned, const char *> TargetFlags[] = {
8881 { MO_GOTPCREL, "amdgpu-gotprel" },
8882 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
8883 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
8884 { MO_REL32_LO, "amdgpu-rel32-lo" },
8885 { MO_REL32_HI, "amdgpu-rel32-hi" },
8886 { MO_ABS32_LO, "amdgpu-abs32-lo" },
8887 { MO_ABS32_HI, "amdgpu-abs32-hi" },
8888 };
8889
8890 return ArrayRef(TargetFlags);
8891}
8892
8895 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8896 {
8897 {MONoClobber, "amdgpu-noclobber"},
8898 {MOLastUse, "amdgpu-last-use"},
8899 };
8900
8901 return ArrayRef(TargetFlags);
8902}
8903
8905 const MachineFunction &MF) const {
8907 assert(SrcReg.isVirtual());
8908 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
8909 return AMDGPU::WWM_COPY;
8910
8911 return AMDGPU::COPY;
8912}
8913
8915 Register Reg) const {
8916 // We need to handle instructions which may be inserted during register
8917 // allocation to handle the prolog. The initial prolog instruction may have
8918 // been separated from the start of the block by spills and copies inserted
8919 // as needed by the prolog. However, the insertions for scalar registers can
8920 // always be placed at the BB top as they are independent of the exec mask
8921 // value.
8922 bool IsNullOrVectorRegister = true;
8923 if (Reg) {
8924 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8925 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
8926 }
8927
8928 uint16_t Opcode = MI.getOpcode();
8929 // FIXME: Copies inserted in the block prolog for live-range split should also
8930 // be included.
8931 return IsNullOrVectorRegister &&
8932 (isSpill(Opcode) || (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
8933 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
8934}
8935
8939 const DebugLoc &DL,
8940 Register DestReg) const {
8941 if (ST.hasAddNoCarry())
8942 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
8943
8945 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
8946 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
8947
8948 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8949 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8950}
8951
8954 const DebugLoc &DL,
8955 Register DestReg,
8956 RegScavenger &RS) const {
8957 if (ST.hasAddNoCarry())
8958 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
8959
8960 // If available, prefer to use vcc.
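  // Using VCC for the (dead) carry output avoids scavenging another SGPR pair.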
8961 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
8962 ? Register(RI.getVCC())
8964 *RI.getBoolRC(), I, /* RestoreAfter */ false,
8965 0, /* AllowSpill */ false);
8966
8967 // TODO: Users need to deal with this.
8968 if (!UnusedCarry.isValid())
8969 return MachineInstrBuilder();
8970
8971 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8972 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8973}
8974
8975bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
8976 switch (Opcode) {
8977 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
8978 case AMDGPU::SI_KILL_I1_TERMINATOR:
8979 return true;
8980 default:
8981 return false;
8982 }
8983}
8984
8986 switch (Opcode) {
8987 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
8988 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
8989 case AMDGPU::SI_KILL_I1_PSEUDO:
8990 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
8991 default:
8992 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
8993 }
8994}
8995
8996bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
8997 return Imm <= getMaxMUBUFImmOffset(ST);
8998}
8999
9001 // The GFX12 field is a 24-bit signed byte offset, of which only the non-negative range is used; older targets have a 12-bit unsigned field.
9002 const unsigned OffsetBits =
9003 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9004 return (1 << OffsetBits) - 1;
9005}
9006
9008 if (!ST.isWave32())
9009 return;
9010
9011 if (MI.isInlineAsm())
9012 return;
9013
9014 for (auto &Op : MI.implicit_operands()) {
9015 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9016 Op.setReg(AMDGPU::VCC_LO);
9017 }
9018}
9019
9021 if (!isSMRD(MI))
9022 return false;
9023
9024 // Check that it is using a buffer resource.
9025 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9026 if (Idx == -1) // e.g. s_memtime
9027 return false;
9028
9029 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9030 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9031}
9032
9033// Given Imm, split it into the values to put into the SOffset and ImmOffset
9034// fields in an MUBUF instruction. Return false if it is not possible (due to a
9035// hardware bug needing a workaround).
9036//
9037// The required alignment ensures that individual address components remain
9038// aligned if they are aligned to begin with. It also ensures that additional
9039// offsets within the given alignment can be added to the resulting ImmOffset.
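// Illustrative example (pre-GFX12 maximum of 4095 with Align(4), so MaxImm = 4092):
// splitting Imm = 4100 yields ImmOffset = 4092 and SOffset = 8, while
// Imm = 10000 yields ImmOffset = 1812 and SOffset = 8188.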
9041 uint32_t &ImmOffset, Align Alignment) const {
9042 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9043 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9044 uint32_t Overflow = 0;
9045
9046 if (Imm > MaxImm) {
9047 if (Imm <= MaxImm + 64) {
9048 // Use an SOffset inline constant for 4..64
9049 Overflow = Imm - MaxImm;
9050 Imm = MaxImm;
9051 } else {
9052 // Try to keep the same value in SOffset for adjacent loads, so that
9053 // the corresponding register contents can be re-used.
9054 //
9055 // Load values with all low-bits (except for alignment bits) set into
9056 // SOffset, so that a larger range of values can be covered using
9057 // s_movk_i32.
9058 //
9059 // Atomic operations fail to work correctly when individual address
9060 // components are unaligned, even if their sum is aligned.
9061 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9062 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9063 Imm = Low;
9064 Overflow = High - Alignment.value();
9065 }
9066 }
9067
9068 if (Overflow > 0) {
9069 // There is a hardware bug in SI and CI which prevents address clamping in
9070 // MUBUF instructions from working correctly with SOffsets. The immediate
9071 // offset is unaffected.
9073 return false;
9074
9075 // It is not possible to set an immediate in the SOffset field on some targets.
9076 if (ST.hasRestrictedSOffset())
9077 return false;
9078 }
9079
9080 ImmOffset = Imm;
9081 SOffset = Overflow;
9082 return true;
9083}
9084
9085// Depending on the used address space and instructions, some immediate offsets
9086// are allowed and some are not.
9087 // Pre-GFX12, flat instruction offsets can only be non-negative, while global and
9088// scratch instruction offsets can also be negative. On GFX12, offsets can be
9089// negative for all variants.
9090//
9091// There are several bugs related to these offsets:
9092// On gfx10.1, flat instructions that go into the global address space cannot
9093// use an offset.
9094//
9095// For scratch instructions, the address can be either an SGPR or a VGPR.
9096// The following offsets can be used, depending on the architecture (x means
9097// cannot be used):
9098// +----------------------------+------+------+
9099// | Address-Mode | SGPR | VGPR |
9100// +----------------------------+------+------+
9101// | gfx9 | | |
9102// | negative, 4-aligned offset | x | ok |
9103// | negative, unaligned offset | x | ok |
9104// +----------------------------+------+------+
9105// | gfx10 | | |
9106// | negative, 4-aligned offset | ok | ok |
9107// | negative, unaligned offset | ok | x |
9108// +----------------------------+------+------+
9109// | gfx10.3 | | |
9110// | negative, 4-aligned offset | ok | ok |
9111// | negative, unaligned offset | ok | ok |
9112// +----------------------------+------+------+
9113//
9114// This function ignores the addressing mode, so if an offset cannot be used in
9115// one addressing mode, it is considered illegal.
9116bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9117 uint64_t FlatVariant) const {
9118 // TODO: Should 0 be special cased?
9119 if (!ST.hasFlatInstOffsets())
9120 return false;
9121
9122 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9123 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9124 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9125 return false;
9126
9128 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9129 (Offset % 4) != 0) {
9130 return false;
9131 }
9132
9133 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9134 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9135 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9136}
9137
9138// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
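// Illustrative example, assuming a 13-bit signed immediate (NumBits = 12 below):
// COffsetVal = 9000 splits into ImmField = 808 and RemainderOffset = 8192, and
// COffsetVal = -9000 splits into ImmField = -808 and RemainderOffset = -8192.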
9139std::pair<int64_t, int64_t>
9140SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9141 uint64_t FlatVariant) const {
9142 int64_t RemainderOffset = COffsetVal;
9143 int64_t ImmField = 0;
9144
9145 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9146 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9147
9148 if (AllowNegative) {
9149 // Use signed division by a power of two to truncate towards 0.
9150 int64_t D = 1LL << NumBits;
9151 RemainderOffset = (COffsetVal / D) * D;
9152 ImmField = COffsetVal - RemainderOffset;
9153
9155 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9156 (ImmField % 4) != 0) {
9157 // Make ImmField a multiple of 4
9158 RemainderOffset += ImmField % 4;
9159 ImmField -= ImmField % 4;
9160 }
9161 } else if (COffsetVal >= 0) {
9162 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9163 RemainderOffset = COffsetVal - ImmField;
9164 }
9165
9166 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9167 assert(RemainderOffset + ImmField == COffsetVal);
9168 return {ImmField, RemainderOffset};
9169}
9170
9172 if (ST.hasNegativeScratchOffsetBug() &&
9173 FlatVariant == SIInstrFlags::FlatScratch)
9174 return false;
9175
9176 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9177}
9178
9179static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9180 switch (ST.getGeneration()) {
9181 default:
9182 break;
9185 return SIEncodingFamily::SI;
9188 return SIEncodingFamily::VI;
9195 }
9196 llvm_unreachable("Unknown subtarget generation!");
9197}
9198
9199bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9200 switch(MCOp) {
9201 // These opcodes use indirect register addressing so
9202 // they need special handling by codegen (currently missing).
9203 // Therefore it is too risky to allow these opcodes
9204 // to be selected by the DPP combiner or the SDWA peephole pass.
9205 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9206 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9207 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9208 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9209 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9210 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9211 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9212 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9213 return true;
9214 default:
9215 return false;
9216 }
9217}
9218
9219int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9220 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9221
9222 unsigned Gen = subtargetEncodingFamily(ST);
9223
9224 if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
9227
9228 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9229 // subtarget has the UnpackedD16VMem feature.
9230 // TODO: remove this when we discard GFX80 encoding.
9231 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9233
9234 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9235 switch (ST.getGeneration()) {
9236 default:
9238 break;
9241 break;
9244 break;
9245 }
9246 }
9247
9248 if (isMAI(Opcode)) {
9249 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9250 if (MFMAOp != -1)
9251 Opcode = MFMAOp;
9252 }
9253
9254 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9255
9256 // -1 means that Opcode is already a native instruction.
9257 if (MCOp == -1)
9258 return Opcode;
9259
9260 if (ST.hasGFX90AInsts()) {
9261 uint16_t NMCOp = (uint16_t)-1;
9262 if (ST.hasGFX940Insts())
9264 if (NMCOp == (uint16_t)-1)
9266 if (NMCOp == (uint16_t)-1)
9268 if (NMCOp != (uint16_t)-1)
9269 MCOp = NMCOp;
9270 }
9271
9272 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9273 // no encoding in the given subtarget generation.
9274 if (MCOp == (uint16_t)-1)
9275 return -1;
9276
9277 if (isAsmOnlyOpcode(MCOp))
9278 return -1;
9279
9280 return MCOp;
9281}
9282
9283static
9285 assert(RegOpnd.isReg());
9286 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9287 getRegSubRegPair(RegOpnd);
9288}
9289
9292 assert(MI.isRegSequence());
9293 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9294 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9295 auto &RegOp = MI.getOperand(1 + 2 * I);
9296 return getRegOrUndef(RegOp);
9297 }
9299}
9300
9301// Try to find the definition of reg:subreg in subreg-manipulation pseudos
9302// Following a subreg of reg:subreg isn't supported
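// E.g. given %3 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, a query for
// %3:sub1 is redirected to %1 (illustrative MIR).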
9305 if (!RSR.SubReg)
9306 return false;
9307 switch (MI.getOpcode()) {
9308 default: break;
9309 case AMDGPU::REG_SEQUENCE:
9310 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
9311 return true;
9312 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
9313 case AMDGPU::INSERT_SUBREG:
9314 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
9315 // inserted the subreg we're looking for
9316 RSR = getRegOrUndef(MI.getOperand(2));
9317 else { // the subreg in the rest of the reg
9318 auto R1 = getRegOrUndef(MI.getOperand(1));
9319 if (R1.SubReg) // subreg of subreg isn't supported
9320 return false;
9321 RSR.Reg = R1.Reg;
9322 }
9323 return true;
9324 }
9325 return false;
9326}
9327
9330 assert(MRI.isSSA());
9331 if (!P.Reg.isVirtual())
9332 return nullptr;
9333
9334 auto RSR = P;
9335 auto *DefInst = MRI.getVRegDef(RSR.Reg);
9336 while (auto *MI = DefInst) {
9337 DefInst = nullptr;
9338 switch (MI->getOpcode()) {
9339 case AMDGPU::COPY:
9340 case AMDGPU::V_MOV_B32_e32: {
9341 auto &Op1 = MI->getOperand(1);
9342 if (Op1.isReg() && Op1.getReg().isVirtual()) {
9343 if (Op1.isUndef())
9344 return nullptr;
9345 RSR = getRegSubRegPair(Op1);
9346 DefInst = MRI.getVRegDef(RSR.Reg);
9347 }
9348 break;
9349 }
9350 default:
9351 if (followSubRegDef(*MI, RSR)) {
9352 if (!RSR.Reg)
9353 return nullptr;
9354 DefInst = MRI.getVRegDef(RSR.Reg);
9355 }
9356 }
9357 if (!DefInst)
9358 return MI;
9359 }
9360 return nullptr;
9361}
9362
9364 Register VReg,
9365 const MachineInstr &DefMI,
9366 const MachineInstr &UseMI) {
9367 assert(MRI.isSSA() && "Must be run on SSA");
9368
9369 auto *TRI = MRI.getTargetRegisterInfo();
9370 auto *DefBB = DefMI.getParent();
9371
9372 // Don't bother searching between blocks, although it is possible this block
9373 // doesn't modify exec.
9374 if (UseMI.getParent() != DefBB)
9375 return true;
9376
9377 const int MaxInstScan = 20;
9378 int NumInst = 0;
9379
9380 // Stop scan at the use.
9381 auto E = UseMI.getIterator();
9382 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
9383 if (I->isDebugInstr())
9384 continue;
9385
9386 if (++NumInst > MaxInstScan)
9387 return true;
9388
9389 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9390 return true;
9391 }
9392
9393 return false;
9394}
9395
9397 Register VReg,
9398 const MachineInstr &DefMI) {
9399 assert(MRI.isSSA() && "Must be run on SSA");
9400
9401 auto *TRI = MRI.getTargetRegisterInfo();
9402 auto *DefBB = DefMI.getParent();
9403
9404 const int MaxUseScan = 10;
9405 int NumUse = 0;
9406
9407 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
9408 auto &UseInst = *Use.getParent();
9409 // Don't bother searching between blocks, although it is possible this block
9410 // doesn't modify exec.
9411 if (UseInst.getParent() != DefBB || UseInst.isPHI())
9412 return true;
9413
9414 if (++NumUse > MaxUseScan)
9415 return true;
9416 }
9417
9418 if (NumUse == 0)
9419 return false;
9420
9421 const int MaxInstScan = 20;
9422 int NumInst = 0;
9423
9424 // Stop scan when we have seen all the uses.
9425 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
9426 assert(I != DefBB->end());
9427
9428 if (I->isDebugInstr())
9429 continue;
9430
9431 if (++NumInst > MaxInstScan)
9432 return true;
9433
9434 for (const MachineOperand &Op : I->operands()) {
9435 // We don't check reg masks here as they're used only on calls:
9436 // 1. EXEC is only considered const within one BB
9437 // 2. Call should be a terminator instruction if present in a BB
9438
9439 if (!Op.isReg())
9440 continue;
9441
9442 Register Reg = Op.getReg();
9443 if (Op.isUse()) {
9444 if (Reg == VReg && --NumUse == 0)
9445 return false;
9446 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
9447 return true;
9448 }
9449 }
9450}
9451
9454 const DebugLoc &DL, Register Src, Register Dst) const {
9455 auto Cur = MBB.begin();
9456 if (Cur != MBB.end())
9457 do {
9458 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
9459 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
9460 ++Cur;
9461 } while (Cur != MBB.end() && Cur != LastPHIIt);
9462
9463 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
9464 Dst);
9465}
9466
9469 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
9470 if (InsPt != MBB.end() &&
9471 (InsPt->getOpcode() == AMDGPU::SI_IF ||
9472 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9473 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9474 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
9475 InsPt++;
9476 return BuildMI(MBB, InsPt, DL,
9477 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
9478 : AMDGPU::S_MOV_B64_term),
9479 Dst)
9480 .addReg(Src, 0, SrcSubReg)
9481 .addReg(AMDGPU::EXEC, RegState::Implicit);
9482 }
9483 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
9484 Dst);
9485}
9486
9487bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
9488
9491 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
9492 VirtRegMap *VRM) const {
9493 // This is a bit of a hack (copied from AArch64). Consider this instruction:
9494 //
9495 // %0:sreg_32 = COPY $m0
9496 //
9497 // We explicitly chose SReg_32 for the virtual register so such a copy might
9498 // be eliminated by RegisterCoalescer. However, that may not be possible, and
9499 // %0 may even spill. We can't spill $m0 normally (it would require copying to
9500 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
9501 // TargetInstrInfo::foldMemoryOperand() is going to try.
9502 // A similar issue also exists with spilling and reloading $exec registers.
9503 //
9504 // To prevent that, constrain the %0 register class here.
9505 if (isFullCopyInstr(MI)) {
9506 Register DstReg = MI.getOperand(0).getReg();
9507 Register SrcReg = MI.getOperand(1).getReg();
9508 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
9509 (DstReg.isVirtual() != SrcReg.isVirtual())) {
9511 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
9512 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
9513 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
9514 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
9515 return nullptr;
9516 } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
9517 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
9518 return nullptr;
9519 }
9520 }
9521 }
9522
9523 return nullptr;
9524}
9525
9527 const MachineInstr &MI,
9528 unsigned *PredCost) const {
9529 if (MI.isBundle()) {
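    // A bundle's latency is approximated as its slowest member plus one cycle
    // of issue per additional bundled instruction, i.e. max latency + count - 1.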
9531 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
9532 unsigned Lat = 0, Count = 0;
9533 for (++I; I != E && I->isBundledWithPred(); ++I) {
9534 ++Count;
9535 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
9536 }
9537 return Lat + Count - 1;
9538 }
9539
9540 return SchedModel.computeInstrLatency(&MI);
9541}
9542
9545 unsigned opcode = MI.getOpcode();
9546 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
9547 auto IID = GI->getIntrinsicID();
9552
9553 switch (IID) {
9554 case Intrinsic::amdgcn_if:
9555 case Intrinsic::amdgcn_else:
9556 // FIXME: Uniform if second result
9557 break;
9558 }
9559
9561 }
9562
9563 // Loads from the private and flat address spaces are divergent, because
9564 // threads can execute the load instruction with the same inputs and get
9565 // different results.
9566 //
9567 // All other loads are not divergent, because if threads issue loads with the
9568 // same arguments, they will always get the same result.
9569 if (opcode == AMDGPU::G_LOAD) {
9570 if (MI.memoperands_empty())
9571 return InstructionUniformity::NeverUniform; // conservative assumption
9572
9573 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9574 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9575 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9576 })) {
9577 // At least one MMO in a non-global address space.
9579 }
9581 }
9582
9583 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
9584 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
9585 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
9586 AMDGPU::isGenericAtomic(opcode)) {
9588 }
9590}
9591
9594
9595 if (isNeverUniform(MI))
9597
9598 unsigned opcode = MI.getOpcode();
9599 if (opcode == AMDGPU::V_READLANE_B32 ||
9600 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
9601 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
9603
9604 if (isCopyInstr(MI)) {
9605 const MachineOperand &srcOp = MI.getOperand(1);
9606 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
9607 const TargetRegisterClass *regClass =
9608 RI.getPhysRegBaseClass(srcOp.getReg());
9611 }
9613 }
9614
9615 // GMIR handling
9616 if (MI.isPreISelOpcode())
9618
9619 // Atomics are divergent because they are executed sequentially: when an
9620 // atomic operation refers to the same address in each thread, then each
9621 // thread after the first sees the value written by the previous thread as the
9622 // original value.
9623
9624 if (isAtomic(MI))
9626
9627 // Loads from the private and flat address spaces are divergent, because
9628 // threads can execute the load instruction with the same inputs and get
9629 // different results.
9630 if (isFLAT(MI) && MI.mayLoad()) {
9631 if (MI.memoperands_empty())
9632 return InstructionUniformity::NeverUniform; // conservative assumption
9633
9634 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9635 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9636 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9637 })) {
9638 // At least one MMO in a non-global address space.
9640 }
9641
9643 }
9644
9645 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9646 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
9647
9648 // FIXME: It's conceptually broken to report this for an instruction, and not
9649 // a specific def operand. For inline asm in particular, there could be mixed
9650 // uniform and divergent results.
9651 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
9652 const MachineOperand &SrcOp = MI.getOperand(I);
9653 if (!SrcOp.isReg())
9654 continue;
9655
9656 Register Reg = SrcOp.getReg();
9657 if (!Reg || !SrcOp.readsReg())
9658 continue;
9659
9660 // If RegBank is null, this is unassigned or an unallocatable special
9661 // register, which are all scalars.
9662 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
9663 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
9665 }
9666
9667 // TODO: The uniformity check conditions above could be rearranged for more
9668 // readability.
9669
9670 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
9671 // currently turned into no-op COPYs by SelectionDAG ISel and are
9672 // therefore no longer recognizable.
9673
9675}
9676
9678 switch (MF.getFunction().getCallingConv()) {
9680 return 1;
9682 return 2;
9684 return 3;
9688 report_fatal_error("ds_ordered_count unsupported for this calling conv");
9691 case CallingConv::C:
9692 case CallingConv::Fast:
9693 default:
9694 // Assume other calling conventions are various compute callable functions
9695 return 0;
9696 }
9697}
9698
9700 Register &SrcReg2, int64_t &CmpMask,
9701 int64_t &CmpValue) const {
9702 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
9703 return false;
9704
9705 switch (MI.getOpcode()) {
9706 default:
9707 break;
9708 case AMDGPU::S_CMP_EQ_U32:
9709 case AMDGPU::S_CMP_EQ_I32:
9710 case AMDGPU::S_CMP_LG_U32:
9711 case AMDGPU::S_CMP_LG_I32:
9712 case AMDGPU::S_CMP_LT_U32:
9713 case AMDGPU::S_CMP_LT_I32:
9714 case AMDGPU::S_CMP_GT_U32:
9715 case AMDGPU::S_CMP_GT_I32:
9716 case AMDGPU::S_CMP_LE_U32:
9717 case AMDGPU::S_CMP_LE_I32:
9718 case AMDGPU::S_CMP_GE_U32:
9719 case AMDGPU::S_CMP_GE_I32:
9720 case AMDGPU::S_CMP_EQ_U64:
9721 case AMDGPU::S_CMP_LG_U64:
9722 SrcReg = MI.getOperand(0).getReg();
9723 if (MI.getOperand(1).isReg()) {
9724 if (MI.getOperand(1).getSubReg())
9725 return false;
9726 SrcReg2 = MI.getOperand(1).getReg();
9727 CmpValue = 0;
9728 } else if (MI.getOperand(1).isImm()) {
9729 SrcReg2 = Register();
9730 CmpValue = MI.getOperand(1).getImm();
9731 } else {
9732 return false;
9733 }
9734 CmpMask = ~0;
9735 return true;
9736 case AMDGPU::S_CMPK_EQ_U32:
9737 case AMDGPU::S_CMPK_EQ_I32:
9738 case AMDGPU::S_CMPK_LG_U32:
9739 case AMDGPU::S_CMPK_LG_I32:
9740 case AMDGPU::S_CMPK_LT_U32:
9741 case AMDGPU::S_CMPK_LT_I32:
9742 case AMDGPU::S_CMPK_GT_U32:
9743 case AMDGPU::S_CMPK_GT_I32:
9744 case AMDGPU::S_CMPK_LE_U32:
9745 case AMDGPU::S_CMPK_LE_I32:
9746 case AMDGPU::S_CMPK_GE_U32:
9747 case AMDGPU::S_CMPK_GE_I32:
9748 SrcReg = MI.getOperand(0).getReg();
9749 SrcReg2 = Register();
9750 CmpValue = MI.getOperand(1).getImm();
9751 CmpMask = ~0;
9752 return true;
9753 }
9754
9755 return false;
9756}
9757
9759 Register SrcReg2, int64_t CmpMask,
9760 int64_t CmpValue,
9761 const MachineRegisterInfo *MRI) const {
9762 if (!SrcReg || SrcReg.isPhysical())
9763 return false;
9764
9765 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
9766 return false;
9767
9768 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
9769 this](int64_t ExpectedValue, unsigned SrcSize,
9770 bool IsReversible, bool IsSigned) -> bool {
9771 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9772 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9773 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9774 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9775 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
9776 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9777 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9778 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9779 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9780 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
9781 //
9782 // Signed ge/gt are not used for the sign bit.
9783 //
9784 // If result of the AND is unused except in the compare:
9785 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
9786 //
9787 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9788 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9789 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
9790 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9791 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9792 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
9793
9794 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
9795 if (!Def || Def->getParent() != CmpInstr.getParent())
9796 return false;
9797
9798 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
9799 Def->getOpcode() != AMDGPU::S_AND_B64)
9800 return false;
9801
9802 int64_t Mask;
9803 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
9804 if (MO->isImm())
9805 Mask = MO->getImm();
9806 else if (!getFoldableImm(MO, Mask))
9807 return false;
9808 Mask &= maxUIntN(SrcSize);
9809 return isPowerOf2_64(Mask);
9810 };
9811
9812 MachineOperand *SrcOp = &Def->getOperand(1);
9813 if (isMask(SrcOp))
9814 SrcOp = &Def->getOperand(2);
9815 else if (isMask(&Def->getOperand(2)))
9816 SrcOp = &Def->getOperand(1);
9817 else
9818 return false;
9819
9820 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
9821 if (IsSigned && BitNo == SrcSize - 1)
9822 return false;
9823
9824 ExpectedValue <<= BitNo;
9825
9826 bool IsReversedCC = false;
9827 if (CmpValue != ExpectedValue) {
9828 if (!IsReversible)
9829 return false;
9830 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
9831 if (!IsReversedCC)
9832 return false;
9833 }
9834
9835 Register DefReg = Def->getOperand(0).getReg();
9836 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
9837 return false;
9838
9839 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
9840 I != E; ++I) {
9841 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
9842 I->killsRegister(AMDGPU::SCC, &RI))
9843 return false;
9844 }
9845
9846 MachineOperand *SccDef =
9847 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
9848 SccDef->setIsDead(false);
9849 CmpInstr.eraseFromParent();
9850
9851 if (!MRI->use_nodbg_empty(DefReg)) {
9852 assert(!IsReversedCC);
9853 return true;
9854 }
9855
9856 // Replace AND with unused result with a S_BITCMP.
9857 MachineBasicBlock *MBB = Def->getParent();
9858
9859 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
9860 : AMDGPU::S_BITCMP1_B32
9861 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
9862 : AMDGPU::S_BITCMP1_B64;
9863
9864 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
9865 .add(*SrcOp)
9866 .addImm(BitNo);
9867 Def->eraseFromParent();
9868
9869 return true;
9870 };
9871
9872 switch (CmpInstr.getOpcode()) {
9873 default:
9874 break;
9875 case AMDGPU::S_CMP_EQ_U32:
9876 case AMDGPU::S_CMP_EQ_I32:
9877 case AMDGPU::S_CMPK_EQ_U32:
9878 case AMDGPU::S_CMPK_EQ_I32:
9879 return optimizeCmpAnd(1, 32, true, false);
9880 case AMDGPU::S_CMP_GE_U32:
9881 case AMDGPU::S_CMPK_GE_U32:
9882 return optimizeCmpAnd(1, 32, false, false);
9883 case AMDGPU::S_CMP_GE_I32:
9884 case AMDGPU::S_CMPK_GE_I32:
9885 return optimizeCmpAnd(1, 32, false, true);
9886 case AMDGPU::S_CMP_EQ_U64:
9887 return optimizeCmpAnd(1, 64, true, false);
9888 case AMDGPU::S_CMP_LG_U32:
9889 case AMDGPU::S_CMP_LG_I32:
9890 case AMDGPU::S_CMPK_LG_U32:
9891 case AMDGPU::S_CMPK_LG_I32:
9892 return optimizeCmpAnd(0, 32, true, false);
9893 case AMDGPU::S_CMP_GT_U32:
9894 case AMDGPU::S_CMPK_GT_U32:
9895 return optimizeCmpAnd(0, 32, false, false);
9896 case AMDGPU::S_CMP_GT_I32:
9897 case AMDGPU::S_CMPK_GT_I32:
9898 return optimizeCmpAnd(0, 32, false, true);
9899 case AMDGPU::S_CMP_LG_U64:
9900 return optimizeCmpAnd(0, 64, true, false);
9901 }
9902
9903 return false;
9904}
9905
9907 unsigned OpName) const {
9908 if (!ST.needsAlignedVGPRs())
9909 return;
9910
9911 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
9912 if (OpNo < 0)
9913 return;
9914 MachineOperand &Op = MI.getOperand(OpNo);
9915 if (getOpSize(MI, OpNo) > 4)
9916 return;
9917
9918 // Add implicit aligned super-reg to force alignment on the data operand.
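  // The 32-bit data register becomes sub0 of a fresh 64-bit aligned pair whose
  // sub1 is an IMPLICIT_DEF; the pair is also added as an implicit operand so
  // the register allocator assigns an even-aligned register.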
9919 const DebugLoc &DL = MI.getDebugLoc();
9920 MachineBasicBlock *BB = MI.getParent();
9922 Register DataReg = Op.getReg();
9923 bool IsAGPR = RI.isAGPR(MRI, DataReg);
9924 Register Undef = MRI.createVirtualRegister(
9925 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
9926 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
9927 Register NewVR =
9928 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
9929 : &AMDGPU::VReg_64_Align2RegClass);
9930 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
9931 .addReg(DataReg, 0, Op.getSubReg())
9932 .addImm(AMDGPU::sub0)
9933 .addReg(Undef)
9934 .addImm(AMDGPU::sub1);
9935 Op.setReg(NewVR);
9936 Op.setSubReg(AMDGPU::sub0);
9937 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
9938}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
amdgpu AMDGPU Register Bank Select
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
TargetInstrInfo::RegSubRegPair RegSubRegPair
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillRestoreOpcode(unsigned Size)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MachineRegisterInfo &MRI, const MCInstrDesc &TID, unsigned RCID, bool IsAllocatable)
static unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillSaveOpcode(unsigned Size)
static bool resultDependsOnExec(const MachineInstr &MI)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
Definition: SIInstrInfo.cpp:85
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static constexpr unsigned ModifierOpNames[]
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
Definition: SIInstrInfo.cpp:76
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool IsDead
static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool has16BitInsts() const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
Class for arbitrary precision integers.
Definition: APInt.h:77
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:806
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1521
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:274
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
bool useVGPRIndexMode() const
bool hasSDWAOmod() const
Definition: GCNSubtarget.h:744
bool hasA16() const
bool hasSDWAScalar() const
Definition: GCNSubtarget.h:748
bool hasScalarCompareEq64() const
Definition: GCNSubtarget.h:979
bool hasOnlyRevVALUShifts() const
Definition: GCNSubtarget.h:387
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:623
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:764
bool hasMAIInsts() const
Definition: GCNSubtarget.h:814
bool hasMFMAInlineLiteralBug() const
bool hasNegativeScratchOffsetBug() const
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasR128A16() const
bool hasOffset3fBug() const
bool hasGetPCZeroExtension() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:274
const AMDGPURegisterBankInfo * getRegBankInfo() const override
Definition: GCNSubtarget.h:294
bool hasSDWAOutModsVOPC() const
Definition: GCNSubtarget.h:760
bool hasRestrictedSOffset() const
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:679
bool hasGFX940Insts() const
bool hasSDWASdst() const
Definition: GCNSubtarget.h:752
bool hasMovB64() const
bool isWave32() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:340
bool hasNegativeUnalignedScratchOffsetBug() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
Generation getGeneration() const
Definition: GCNSubtarget.h:313
bool hasVOP3Literal() const
Definition: GCNSubtarget.h:904
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:731
bool hasAddr64() const
Definition: GCNSubtarget.h:377
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:723
bool hasGDS() const
bool hasPartialNSAEncoding() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
A possibly irreducible generalization of a Loop.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:157
iterator find(SlotIndex Pos)
find - Return an iterator pointing to the first segment that ends after Pos, or end().
void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:541
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:611
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:621
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:194
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:237
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition: MCInstrDesc.h:444
bool mayLoad() const
Return true if this instruction could possibly read memory.
Definition: MCInstrDesc.h:438
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition: MCInstrDesc.h:248
unsigned short Opcode
Definition: MCInstrDesc.h:205
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
Definition: MCInstrDesc.h:565
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
uint8_t OperandType
Information about the type of the operand.
Definition: MCInstrDesc.h:97
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:91
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:397
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
void setVariableValue(const MCExpr *Value)
Definition: MCSymbol.cpp:47
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
unsigned pred_size() const
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into two pieces at SplitInst.
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< iterator > terminators()
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
@ LQR_Dead
Register is known to be fully dead.
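computeRegisterLiveness and LQR_Dead are commonly paired to decide whether a status register may be clobbered at a given point. A sketch, assuming MachineBasicBlock &MBB, an iterator I, and const TargetRegisterInfo *TRI are in scope:
bool SCCIsDead =
    MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, I, /*Neighborhood=*/16) ==
    MachineBasicBlock::LQR_Dead;
// Only insert an SCC-defining instruction at I if SCCIsDead is true.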
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineDomTreeNode * addNewBlock(MachineBasicBlock *BB, MachineBasicBlock *DomBB)
addNewBlock - Add a new node to the dominator tree information.
bool properlyDominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
void changeImmediateDominator(MachineBasicBlock *N, MachineBasicBlock *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
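These builder methods are chained off BuildMI (listed further below) to assemble an instruction operand by operand. A sketch, assuming TII, MBB, an insertion iterator I, a DebugLoc DL, and a destination register DstReg are in scope:
// Materialize a 32-bit constant into DstReg with a scalar move.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
    .addImm(42);
// A rewritten memory instruction would typically also carry over state, e.g.
//   .addFrameIndex(FI).cloneMemRefs(OrigMI).copyImplicitOps(OrigMI);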
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:569
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:346
void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
unsigned getNumOperands() const
Returns the total number of operands.
Definition: MachineInstr.h:572
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
iterator_range< mop_iterator > explicit_operands()
Definition: MachineInstr.h:691
unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:815
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:800
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:782
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:498
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
iterator_range< mop_iterator > implicit_operands()
Definition: MachineInstr.h:699
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
uint32_t getFlags() const
Return the MI flags bitvector.
Definition: MachineInstr.h:391
int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
bool addRegisterDead(Register Reg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound=false)
We have determined MI defined a register without a use.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
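The ChangeTo*/set* mutators above rewrite an operand in place. A sketch of folding a known constant into an operand (the helper is hypothetical, not an LLVM API):
static void foldKnownValue(MachineOperand &MO, int64_t Val) {
  if (MO.isReg() && !MO.isImplicit())
    MO.ChangeToImmediate(Val); // the operand slot stays, the register goes away
  else if (MO.isImm())
    MO.setImm(Val);
}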
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
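The scavenger entries above are used when a pass needs a free register after register allocation. A sketch, assuming RegScavenger &RS, MachineBasicBlock &MBB, and an iterator MI are in scope (whether spilling is acceptable is the caller's decision):
RS.enterBasicBlockEnd(MBB);
Register Tmp = RS.scavengeRegisterBackwards(AMDGPU::SReg_32RegClass, MI,
                                            /*RestoreAfter=*/false,
                                            /*SPAdj=*/0,
                                            /*AllowSpill=*/false);
if (!Tmp.isValid()) {
  // No free SGPR was found; fall back to a spill-based sequence.
}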
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:792
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:554
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
bool isNonUniformBranchInstr(MachineInstr &Instr) const
static bool isVOP3(const MachineInstr &MI)
Definition: SIInstrInfo.h:504
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
Definition: SIInstrInfo.h:924
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
Definition: SIInstrInfo.h:1105
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:636
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:544
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const
Get required immediate operand.
Definition: SIInstrInfo.h:1233
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:536
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:649
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:408
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:496
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:512
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:604
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI) const
Copy a value from a VGPR (SrcReg) to SGPR.
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:520
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
bool isSpill(uint16_t Opcode) const
Definition: SIInstrInfo.h:740
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:588
static bool isSOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:448
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:570
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:628
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:596
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
Definition: SIInstrInfo.h:424
static bool isSOPK(const MachineInstr &MI)
Definition: SIInstrInfo.h:464
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:947
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:528
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:618
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:760
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
void materializeImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, int64_t Value) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by the assembler.
static bool isVGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:716
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool isVGPRCopy(const MachineInstr &MI) const
Definition: SIInstrInfo.h:970
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:580
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description.
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
void convertNonUniformIfRegion(MachineBasicBlock *IfEntry, MachineBasicBlock *IfEnd) const
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
Whether we must prevent this instruction from executing with EXEC = 0.
static bool isAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:681
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
static bool sopkIsZext(unsigned Opcode)
Definition: SIInstrInfo.h:863
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
static bool isSGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:728
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:809
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:456
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:612
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:416
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
Definition: SIInstrInfo.h:1246
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
static bool isFixedSize(const MachineInstr &MI)
Definition: SIInstrInfo.h:880
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
static bool isLDSDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:562
unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isVOP1(const MachineInstr &MI)
Definition: SIInstrInfo.h:488
SIInstrInfo(const GCNSubtarget &ST)
Definition: SIInstrInfo.cpp:66
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
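Many of the SIInstrInfo hooks above operate on named operands. A sketch of checking whether an immediate source is encodable in place and legalizing it otherwise, assuming const SIInstrInfo *TII and MachineInstr &MI are in scope:
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
if (Src0Idx != -1) {
  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isImm() && !TII->isImmOperandLegal(MI, Src0Idx, Src0))
    TII->legalizeOpWithMove(MI, Src0Idx); // moves the literal into a register
}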
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
static bool hasVectorRegisters(const TargetRegisterClass *RC)
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool opCanUseInlineConstant(unsigned OpType) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
bool opCanUseLiteralConstant(unsigned OpType) const
static bool hasVGPRs(const TargetRegisterClass *RC)
static bool isVGPRClass(const TargetRegisterClass *RC)
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
const TargetRegisterClass * getBoolRC() const
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getChannelFromSubReg(unsigned SubReg) const
MCRegister getVCC() const
static bool hasAGPRs(const TargetRegisterClass *RC)
const TargetRegisterClass * getWaveMaskRegClass() const
bool spillSGPRToVGPR() const
const TargetRegisterClass * getVGPR64Class() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
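A typical use of the register-class queries above is remapping a value from scalar to vector registers, as the VALU-lowering paths do. A sketch, assuming const SIRegisterInfo &RI, MachineRegisterInfo &MRI, and a virtual Register SrcReg are in scope (getRegClass/createVirtualRegister are standard MachineRegisterInfo calls not listed in this index):
const TargetRegisterClass *RC = MRI.getRegClass(SrcReg);
if (RI.isSGPRClass(RC))
  RC = RI.getEquivalentVGPRClass(RC);
Register NewVReg = MRI.createVirtualRegister(RC);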
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:64
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:236
SlotIndexes pass.
Definition: SlotIndexes.h:296
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:519
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destination register.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destination register.
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
const TargetRegisterClass * getAllocatableClass(const TargetRegisterClass *RC) const
Return the maximal subclass of the given register class that is allocatable, or NULL if none is.
unsigned getSubRegIdxSize(unsigned Idx) const
Get the size of the bit range covered by a sub-register index.
unsigned getSubRegIdxOffset(unsigned Idx) const
Get the offset of the bit range covered by a sub-register index.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
Definition: SIInstrInfo.h:1524
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isDPALU_DPP(const MCInstrDesc &OpDesc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
Definition: SIInstrInfo.h:1525
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
bool isTrue16Inst(unsigned Opc)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo)
Is this an AMDGPU specific source operand? These include registers, inline constants,...
const uint64_t RSRC_TID_ENABLE
Definition: SIInstrInfo.h:1527
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isGenericAtomic(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the inline values reserved for floating-point constants.
bool isHi(unsigned Reg, const MCRegisterInfo &MRI)
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition: SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition: SIDefines.h:201
@ OPERAND_REG_IMM_V2FP16
Definition: SIDefines.h:211
@ OPERAND_REG_INLINE_C_V2INT32
Definition: SIDefines.h:227
@ OPERAND_REG_INLINE_C_FP64
Definition: SIDefines.h:223
@ OPERAND_REG_INLINE_C_BF16
Definition: SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2BF16
Definition: SIDefines.h:225
@ OPERAND_REG_IMM_V2INT16
Definition: SIDefines.h:212
@ OPERAND_REG_IMM_BF16
Definition: SIDefines.h:205
@ OPERAND_REG_INLINE_AC_V2FP16
Definition: SIDefines.h:246
@ OPERAND_REG_IMM_INT32
Operands with register or 32-bit immediate.
Definition: SIDefines.h:200
@ OPERAND_REG_IMM_V2BF16
Definition: SIDefines.h:210
@ OPERAND_REG_IMM_BF16_DEFERRED
Definition: SIDefines.h:207
@ OPERAND_REG_IMM_FP16
Definition: SIDefines.h:206
@ OPERAND_REG_INLINE_C_INT64
Definition: SIDefines.h:219
@ OPERAND_REG_INLINE_AC_BF16
Definition: SIDefines.h:240
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition: SIDefines.h:217
@ OPERAND_REG_INLINE_AC_INT16
Operands with an AccVGPR register or inline constant.
Definition: SIDefines.h:238
@ OPERAND_REG_IMM_FP64
Definition: SIDefines.h:204
@ OPERAND_REG_INLINE_C_V2FP16
Definition: SIDefines.h:226
@ OPERAND_REG_INLINE_AC_V2INT16
Definition: SIDefines.h:244
@ OPERAND_REG_INLINE_AC_FP16
Definition: SIDefines.h:241
@ OPERAND_REG_INLINE_AC_INT32
Definition: SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition: SIDefines.h:242
@ OPERAND_REG_INLINE_AC_V2BF16
Definition: SIDefines.h:245
@ OPERAND_REG_IMM_V2INT32
Definition: SIDefines.h:213
@ OPERAND_REG_IMM_FP32
Definition: SIDefines.h:203
@ OPERAND_INPUT_MODS
Definition: SIDefines.h:251
@ OPERAND_REG_INLINE_C_FP32
Definition: SIDefines.h:222
@ OPERAND_REG_INLINE_C_INT32
Definition: SIDefines.h:218
@ OPERAND_REG_INLINE_C_V2INT16
Definition: SIDefines.h:224
@ OPERAND_REG_IMM_V2FP32
Definition: SIDefines.h:214
@ OPERAND_REG_INLINE_AC_FP64
Definition: SIDefines.h:243
@ OPERAND_REG_INLINE_C_FP16
Definition: SIDefines.h:221
@ OPERAND_REG_IMM_INT16
Definition: SIDefines.h:202
@ OPERAND_REG_INLINE_C_V2FP32
Definition: SIDefines.h:228
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition: SIDefines.h:231
@ OPERAND_REG_IMM_FP32_DEFERRED
Definition: SIDefines.h:209
@ OPERAND_REG_IMM_FP16_DEFERRED
Definition: SIDefines.h:208
@ TI_SCRATCH_RSRC_DWORD1
Definition: AMDGPU.h:409
@ TI_SCRATCH_RSRC_DWORD3
Definition: AMDGPU.h:411
@ TI_SCRATCH_RSRC_DWORD0
Definition: AMDGPU.h:408
@ TI_SCRATCH_RSRC_DWORD2
Definition: AMDGPU.h:410
@ TI_CONSTDATA_START
Definition: AMDGPU.h:407
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
int getMCOpcode(uint16_t Opcode, unsigned Gen)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
Definition: SIInstrInfo.h:1526
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
bool isGraphics(CallingConv::ID cc)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ OPERAND_GENERIC_4
Definition: MCInstrDesc.h:70
@ OPERAND_GENERIC_2
Definition: MCInstrDesc.h:68
@ OPERAND_GENERIC_1
Definition: MCInstrDesc.h:67
@ OPERAND_REGISTER
Definition: MCInstrDesc.h:61
@ OPERAND_GENERIC_3
Definition: MCInstrDesc.h:69
@ OPERAND_IMMEDIATE
Definition: MCInstrDesc.h:60
@ OPERAND_UNKNOWN
Definition: MCInstrDesc.h:59
@ OPERAND_GENERIC_0
Definition: MCInstrDesc.h:66
@ OPERAND_GENERIC_5
Definition: MCInstrDesc.h:71
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:138
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition: SIInstrInfo.h:1415
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:431
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
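make_early_inc_range is the standard way to erase instructions while walking a block, since the iterator is advanced before the body runs. A sketch that removes trivial self-copies (illustrative only):
for (MachineInstr &MI : make_early_inc_range(MBB)) {
  if (MI.isCopy() &&
      MI.getOperand(0).getReg() == MI.getOperand(1).getReg())
    MI.eraseFromParent(); // safe: MI's iterator has already been incremented
}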
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition: MathExtras.h:280
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:244
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
Definition: TargetOpcodes.h:36
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Error
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is Skew mod Align.
Definition: MathExtras.h:483
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition: Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition: MathExtras.h:203
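The arithmetic helpers above show up throughout offset legalization. A small worked sketch of the values they produce:
uint64_t SizeInBytes = 6;
uint64_t NumDWords   = divideCeil(SizeInBytes, 4);      // 2
uint64_t Offset      = 4097;
uint64_t Base        = alignDown(Offset, 4096);         // 4096
bool FitsIn12Bits    = (Offset - Base) <= maxUIntN(12); // true, remainder is 1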
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition: APFloat.cpp:232
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Description of the encoding of one expression Op.
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive all the way through.
Definition: LiveVariables.h:85
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
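getFixedStack pairs with MachineFunction::getMachineMemOperand (listed earlier) when spill and reload code builds its memory operands. A sketch, assuming MachineFunction &MF and a frame index FI are in scope:
MachineMemOperand *MMO = MF.getMachineMemOperand(
    MachinePointerInfo::getFixedStack(MF, FI),
    MachineMemOperand::MOLoad, LLT::scalar(32), Align(4));
// Attach it to the reload instruction with .addMemOperand(MMO).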
Utility to store machine instructions worklist.
Definition: SIInstrInfo.h:49
MachineInstr * top() const
Definition: SIInstrInfo.h:54
bool empty() const
Definition: SIInstrInfo.h:64
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition: SIInstrInfo.h:73
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.