1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
#include "SIInstrInfo.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/CommandLine.h"

36using namespace llvm;
37
38#define DEBUG_TYPE "si-instr-info"
39
40#define GET_INSTRINFO_CTOR_DTOR
41#include "AMDGPUGenInstrInfo.inc"
42
43namespace llvm {
44namespace AMDGPU {
45#define GET_D16ImageDimIntrinsics_IMPL
46#define GET_ImageDimIntrinsicTable_IMPL
47#define GET_RsrcIntrinsics_IMPL
48#include "AMDGPUGenSearchableTables.inc"
} // namespace AMDGPU
} // namespace llvm
51
52
// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests
// for long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

static cl::opt<bool> Fix16BitCopies(
  "amdgpu-fix-16-bit-physreg-copies",
  cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
  cl::init(true),
  cl::ReallyHidden);

SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
    : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
      RI(ST), ST(ST) {
  SchedModel.init(&ST);
}
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
83/// Returns true if both nodes have the same value for the given
84/// operand \p Op, or if both nodes do not have this operand.
85static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
86 unsigned Opc0 = N0->getMachineOpcode();
87 unsigned Opc1 = N1->getMachineOpcode();
88
89 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
90 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
91
92 if (Op0Idx == -1 && Op1Idx == -1)
93 return true;
94
95
96 if ((Op0Idx == -1 && Op1Idx != -1) ||
97 (Op1Idx == -1 && Op0Idx != -1))
98 return false;
99
100 // getNamedOperandIdx returns the index for the MachineInstr's operands,
101 // which includes the result as the first operand. We are indexing into the
102 // MachineSDNode's operands, so we need to skip the result operand to get
103 // the real index.
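  // For example, if the named operand sits at MachineInstr operand index 3,
  // the same operand is at index 2 of the corresponding MachineSDNode.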
104 --Op0Idx;
105 --Op1Idx;
106
107 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
108}
109
static bool canRemat(const MachineInstr &MI) {

  if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
      SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
      SIInstrInfo::isSALU(MI))
    return true;

  if (SIInstrInfo::isSMRD(MI)) {
    return !MI.memoperands_empty() &&
           llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
             return MMO->isLoad() && MMO->isInvariant();
           });
  }

  return false;
}
126
bool SIInstrInfo::isReallyTriviallyReMaterializable(
    const MachineInstr &MI) const {
129
130 if (canRemat(MI)) {
131 // Normally VALU use of exec would block the rematerialization, but that
132 // is OK in this case to have an implicit exec read as all VALU do.
133 // We really want all of the generic logic for this except for this.
134
135 // Another potential implicit use is mode register. The core logic of
136 // the RA will not attempt rematerialization if mode is set anywhere
137 // in the function, otherwise it is safe since mode is not changed.
138
139 // There is difference to generic method which does not allow
140 // rematerialization if there are virtual register uses. We allow this,
141 // therefore this method includes SOP instructions as well.
142 if (!MI.hasImplicitDef() &&
143 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
144 !MI.mayRaiseFPException())
145 return true;
146 }
147
  return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
}
150
// Returns true if the scalar result of a VALU instruction depends on exec.
static bool resultDependsOnExec(const MachineInstr &MI) {
  // Ignore comparisons which are only used masked with exec.
154 // This allows some hoisting/sinking of VALU comparisons.
155 if (MI.isCompare()) {
156 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
157 Register DstReg = MI.getOperand(0).getReg();
158 if (!DstReg.isVirtual())
159 return true;
160 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
161 switch (Use.getOpcode()) {
162 case AMDGPU::S_AND_SAVEEXEC_B32:
163 case AMDGPU::S_AND_SAVEEXEC_B64:
164 break;
165 case AMDGPU::S_AND_B32:
166 case AMDGPU::S_AND_B64:
167 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
168 return true;
169 break;
170 default:
171 return true;
172 }
173 }
174 return false;
175 }
176
177 switch (MI.getOpcode()) {
178 default:
179 break;
180 case AMDGPU::V_READFIRSTLANE_B32:
181 return true;
182 }
183
184 return false;
185}
186
bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
  // Any implicit use of exec by VALU is not a real register read.
  return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
         isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
}
192
bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
                               MachineBasicBlock *SuccToSinkTo,
                               MachineCycleInfo *CI) const {
196 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
197 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
198 return true;
199
200 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
201 // Check if sinking of MI would create temporal divergent use.
202 for (auto Op : MI.uses()) {
203 if (Op.isReg() && Op.getReg().isVirtual() &&
204 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
205 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
206
207 // SgprDef defined inside cycle
208 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
209 if (FromCycle == nullptr)
210 continue;
211
212 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
213 // Check if there is a FromCycle that contains SgprDef's basic block but
214 // does not contain SuccToSinkTo and also has divergent exit condition.
215 while (FromCycle && !FromCycle->contains(ToCycle)) {
216 // After structurize-cfg, there should be exactly one cycle exit.
        SmallVector<MachineBasicBlock *, 1> ExitBlocks;
        FromCycle->getExitBlocks(ExitBlocks);
219 assert(ExitBlocks.size() == 1);
220 assert(ExitBlocks[0]->getSinglePredecessor());
221
222 // FromCycle has divergent exit condition.
223 if (hasDivergentBranch(ExitBlocks[0]->getSinglePredecessor())) {
224 return false;
225 }
226
227 FromCycle = FromCycle->getParentCycle();
228 }
229 }
230 }
231
232 return true;
233}
234
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
238 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
239 return false;
240
241 unsigned Opc0 = Load0->getMachineOpcode();
242 unsigned Opc1 = Load1->getMachineOpcode();
243
244 // Make sure both are actually loads.
245 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
246 return false;
247
248 // A mayLoad instruction without a def is not a load. Likely a prefetch.
249 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
250 return false;
251
252 if (isDS(Opc0) && isDS(Opc1)) {
253
254 // FIXME: Handle this case:
255 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
256 return false;
257
258 // Check base reg.
259 if (Load0->getOperand(0) != Load1->getOperand(0))
260 return false;
261
262 // Skip read2 / write2 variants for simplicity.
263 // TODO: We should report true if the used offsets are adjacent (excluded
264 // st64 versions).
265 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
266 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
267 if (Offset0Idx == -1 || Offset1Idx == -1)
268 return false;
269
270 // XXX - be careful of dataless loads
271 // getNamedOperandIdx returns the index for MachineInstrs. Since they
272 // include the output in the operand list, but SDNodes don't, we need to
273 // subtract the index by one.
274 Offset0Idx -= get(Opc0).NumDefs;
275 Offset1Idx -= get(Opc1).NumDefs;
276 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
277 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
278 return true;
279 }
280
281 if (isSMRD(Opc0) && isSMRD(Opc1)) {
282 // Skip time and cache invalidation instructions.
283 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
284 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
285 return false;
286
287 unsigned NumOps = getNumOperandsNoGlue(Load0);
288 if (NumOps != getNumOperandsNoGlue(Load1))
289 return false;
290
291 // Check base reg.
292 if (Load0->getOperand(0) != Load1->getOperand(0))
293 return false;
294
295 // Match register offsets, if both register and immediate offsets present.
296 assert(NumOps == 4 || NumOps == 5);
297 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
298 return false;
299
300 const ConstantSDNode *Load0Offset =
301 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
302 const ConstantSDNode *Load1Offset =
303 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
304
305 if (!Load0Offset || !Load1Offset)
306 return false;
307
308 Offset0 = Load0Offset->getZExtValue();
309 Offset1 = Load1Offset->getZExtValue();
310 return true;
311 }
312
313 // MUBUF and MTBUF can access the same addresses.
314 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
315
316 // MUBUF and MTBUF have vaddr at different indices.
317 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
318 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
319 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
320 return false;
321
322 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
323 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
324
325 if (OffIdx0 == -1 || OffIdx1 == -1)
326 return false;
327
328 // getNamedOperandIdx returns the index for MachineInstrs. Since they
329 // include the output in the operand list, but SDNodes don't, we need to
330 // subtract the index by one.
331 OffIdx0 -= get(Opc0).NumDefs;
332 OffIdx1 -= get(Opc1).NumDefs;
333
334 SDValue Off0 = Load0->getOperand(OffIdx0);
335 SDValue Off1 = Load1->getOperand(OffIdx1);
336
337 // The offset might be a FrameIndexSDNode.
338 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
339 return false;
340
341 Offset0 = Off0->getAsZExtVal();
342 Offset1 = Off1->getAsZExtVal();
343 return true;
344 }
345
346 return false;
347}
348
349static bool isStride64(unsigned Opc) {
350 switch (Opc) {
351 case AMDGPU::DS_READ2ST64_B32:
352 case AMDGPU::DS_READ2ST64_B64:
353 case AMDGPU::DS_WRITE2ST64_B32:
354 case AMDGPU::DS_WRITE2ST64_B64:
355 return true;
356 default:
357 return false;
358 }
359}
360
bool SIInstrInfo::getMemOperandsWithOffsetWidth(
    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
    int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
    const TargetRegisterInfo *TRI) const {
365 if (!LdSt.mayLoadOrStore())
366 return false;
367
368 unsigned Opc = LdSt.getOpcode();
369 OffsetIsScalable = false;
370 const MachineOperand *BaseOp, *OffsetOp;
371 int DataOpIdx;
372
373 if (isDS(LdSt)) {
374 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
375 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
376 if (OffsetOp) {
377 // Normal, single offset LDS instruction.
378 if (!BaseOp) {
379 // DS_CONSUME/DS_APPEND use M0 for the base address.
380 // TODO: find the implicit use operand for M0 and use that as BaseOp?
381 return false;
382 }
383 BaseOps.push_back(BaseOp);
384 Offset = OffsetOp->getImm();
385 // Get appropriate operand, and compute width accordingly.
386 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
387 if (DataOpIdx == -1)
388 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
389 Width = getOpSize(LdSt, DataOpIdx);
390 } else {
391 // The 2 offset instructions use offset0 and offset1 instead. We can treat
392 // these as a load with a single offset if the 2 offsets are consecutive.
393 // We will use this for some partially aligned loads.
394 const MachineOperand *Offset0Op =
395 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
396 const MachineOperand *Offset1Op =
397 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
398
399 unsigned Offset0 = Offset0Op->getImm() & 0xff;
400 unsigned Offset1 = Offset1Op->getImm() & 0xff;
401 if (Offset0 + 1 != Offset1)
402 return false;
403
404 // Each of these offsets is in element sized units, so we need to convert
405 // to bytes of the individual reads.
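      // Illustrative example: a 32-bit read2 with offset0=2 and offset1=3 has
      // EltSize = 4, so it is reported as a single access at byte offset 8
      // whose width is the size of the vdst operand (8 bytes).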
406
407 unsigned EltSize;
408 if (LdSt.mayLoad())
409 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
410 else {
411 assert(LdSt.mayStore());
412 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
413 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
414 }
415
416 if (isStride64(Opc))
417 EltSize *= 64;
418
419 BaseOps.push_back(BaseOp);
420 Offset = EltSize * Offset0;
421 // Get appropriate operand(s), and compute width accordingly.
422 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
423 if (DataOpIdx == -1) {
424 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
425 Width = getOpSize(LdSt, DataOpIdx);
426 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
427 Width = Width.getValue() + getOpSize(LdSt, DataOpIdx);
428 } else {
429 Width = getOpSize(LdSt, DataOpIdx);
430 }
431 }
432 return true;
433 }
434
435 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
436 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
437 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
438 return false;
439 BaseOps.push_back(RSrc);
440 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
441 if (BaseOp && !BaseOp->isFI())
442 BaseOps.push_back(BaseOp);
443 const MachineOperand *OffsetImm =
444 getNamedOperand(LdSt, AMDGPU::OpName::offset);
445 Offset = OffsetImm->getImm();
446 const MachineOperand *SOffset =
447 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
448 if (SOffset) {
449 if (SOffset->isReg())
450 BaseOps.push_back(SOffset);
451 else
452 Offset += SOffset->getImm();
453 }
454 // Get appropriate operand, and compute width accordingly.
455 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
456 if (DataOpIdx == -1)
457 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
458 if (DataOpIdx == -1) // LDS DMA
459 return false;
460 Width = getOpSize(LdSt, DataOpIdx);
461 return true;
462 }
463
464 if (isImage(LdSt)) {
465 auto RsrcOpName =
466 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
467 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
468 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
469 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
470 if (VAddr0Idx >= 0) {
471 // GFX10 possible NSA encoding.
472 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
473 BaseOps.push_back(&LdSt.getOperand(I));
474 } else {
475 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
476 }
477 Offset = 0;
478 // Get appropriate operand, and compute width accordingly.
479 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
480 Width = getOpSize(LdSt, DataOpIdx);
481 return true;
482 }
483
484 if (isSMRD(LdSt)) {
485 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
486 if (!BaseOp) // e.g. S_MEMTIME
487 return false;
488 BaseOps.push_back(BaseOp);
489 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
490 Offset = OffsetOp ? OffsetOp->getImm() : 0;
491 // Get appropriate operand, and compute width accordingly.
492 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
493 if (DataOpIdx == -1)
494 return false;
495 Width = getOpSize(LdSt, DataOpIdx);
496 return true;
497 }
498
499 if (isFLAT(LdSt)) {
500 // Instructions have either vaddr or saddr or both or none.
501 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
502 if (BaseOp)
503 BaseOps.push_back(BaseOp);
504 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
505 if (BaseOp)
506 BaseOps.push_back(BaseOp);
507 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
508 // Get appropriate operand, and compute width accordingly.
509 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
510 if (DataOpIdx == -1)
511 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
512 if (DataOpIdx == -1) // LDS DMA
513 return false;
514 Width = getOpSize(LdSt, DataOpIdx);
515 return true;
516 }
517
518 return false;
519}
520
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
                                  ArrayRef<const MachineOperand *> BaseOps1,
                                  const MachineInstr &MI2,
                                  ArrayRef<const MachineOperand *> BaseOps2) {
  // Only examine the first "base" operand of each instruction, on the
526 // assumption that it represents the real base address of the memory access.
527 // Other operands are typically offsets or indices from this base address.
528 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
529 return true;
530
531 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
532 return false;
533
534 auto MO1 = *MI1.memoperands_begin();
535 auto MO2 = *MI2.memoperands_begin();
536 if (MO1->getAddrSpace() != MO2->getAddrSpace())
537 return false;
538
539 auto Base1 = MO1->getValue();
540 auto Base2 = MO2->getValue();
541 if (!Base1 || !Base2)
542 return false;
543 Base1 = getUnderlyingObject(Base1);
544 Base2 = getUnderlyingObject(Base2);
545
546 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
547 return false;
548
549 return Base1 == Base2;
550}
551
bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                      int64_t Offset1, bool OffsetIsScalable1,
                                      ArrayRef<const MachineOperand *> BaseOps2,
                                      int64_t Offset2, bool OffsetIsScalable2,
                                      unsigned ClusterSize,
                                      unsigned NumBytes) const {
558 // If the mem ops (to be clustered) do not have the same base ptr, then they
559 // should not be clustered
560 if (!BaseOps1.empty() && !BaseOps2.empty()) {
561 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
562 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
563 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
564 return false;
565 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
566 // If only one base op is empty, they do not have the same base ptr
567 return false;
568 }
569
570 // In order to avoid register pressure, on an average, the number of DWORDS
571 // loaded together by all clustered mem ops should not exceed 8. This is an
572 // empirical value based on certain observations and performance related
573 // experiments.
574 // The good thing about this heuristic is - it avoids clustering of too many
575 // sub-word loads, and also avoids clustering of wide loads. Below is the
576 // brief summary of how the heuristic behaves for various `LoadSize`.
577 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
578 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
579 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
580 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
581 // (5) LoadSize >= 17: do not cluster
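  // Illustrative example: four 4-byte loads give LoadSize = 16/4 = 4 and
  // NumDWORDs = ((4 + 3) / 4) * 4 = 4 <= 8, so they are clustered; four
  // 12-byte loads give NumDWORDs = ((12 + 3) / 4) * 4 = 12 and are not.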
582 const unsigned LoadSize = NumBytes / ClusterSize;
583 const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
584 return NumDWORDs <= 8;
585}
586
587// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
588// the first 16 loads will be interleaved with the stores, and the next 16 will
589// be clustered as expected. It should really split into 2 16 store batches.
590//
591// Loads are clustered until this returns false, rather than trying to schedule
592// groups of stores. This also means we have to deal with saying different
593// address space loads should be clustered, and ones which might cause bank
594// conflicts.
595//
596// This might be deprecated so it might not be worth that much effort to fix.
bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
                                          int64_t Offset0, int64_t Offset1,
                                          unsigned NumLoads) const {
600 assert(Offset1 > Offset0 &&
601 "Second offset should be larger than first offset!");
602 // If we have less than 16 loads in a row, and the offsets are within 64
603 // bytes, then schedule together.
604
605 // A cacheline is 64 bytes (for global memory).
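  // For example, loads at offsets 0 and 48 are scheduled together, while
  // loads at offsets 0 and 128 are not.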
606 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
607}
608
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc,
                              const char *Msg = "illegal VGPR to SGPR copy") {
  MachineFunction *MF = MBB.getParent();
  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
  LLVMContext &C = MF->getFunction().getContext();
  C.diagnose(IllegalCopy);

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
}
622
623/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
624/// possible to have a direct copy in these cases on GFX908, so an intermediate
625/// VGPR copy is required.
static void indirectCopyToAGPR(const SIInstrInfo &TII,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, MCRegister DestReg,
                               MCRegister SrcReg, bool KillSrc,
                               RegScavenger &RS, bool RegsOverlap,
                               Register ImpDefSuperReg = Register(),
                               Register ImpUseSuperReg = Register()) {
634 assert((TII.getSubtarget().hasMAIInsts() &&
635 !TII.getSubtarget().hasGFX90AInsts()) &&
636 "Expected GFX908 subtarget.");
637
638 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
639 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
640 "Source register of the copy should be either an SGPR or an AGPR.");
641
642 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
643 "Destination register of the copy should be an AGPR.");
644
645 const SIRegisterInfo &RI = TII.getRegisterInfo();
646
647 // First try to find defining accvgpr_write to avoid temporary registers.
648 // In the case of copies of overlapping AGPRs, we conservatively do not
649 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
650 // an accvgpr_write used for this same copy due to implicit-defs
651 if (!RegsOverlap) {
652 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
653 --Def;
654
655 if (!Def->modifiesRegister(SrcReg, &RI))
656 continue;
657
658 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
659 Def->getOperand(0).getReg() != SrcReg)
660 break;
661
662 MachineOperand &DefOp = Def->getOperand(1);
663 assert(DefOp.isReg() || DefOp.isImm());
664
665 if (DefOp.isReg()) {
666 bool SafeToPropagate = true;
667 // Check that register source operand is not clobbered before MI.
668 // Immediate operands are always safe to propagate.
669 for (auto I = Def; I != MI && SafeToPropagate; ++I)
670 if (I->modifiesRegister(DefOp.getReg(), &RI))
671 SafeToPropagate = false;
672
673 if (!SafeToPropagate)
674 break;
675
676 DefOp.setIsKill(false);
677 }
678
679 MachineInstrBuilder Builder =
680 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
681 .add(DefOp);
682 if (ImpDefSuperReg)
683 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
684
685 if (ImpUseSuperReg) {
        Builder.addReg(ImpUseSuperReg,
                       getKillRegState(KillSrc) | RegState::Implicit);
      }
689
690 return;
691 }
692 }
693
  RS.enterBasicBlockEnd(MBB);
  RS.backward(std::next(MI));
696
697 // Ideally we want to have three registers for a long reg_sequence copy
698 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
699 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
700 *MBB.getParent());
701
702 // Registers in the sequence are allocated contiguously so we can just
703 // use register number to pick one of three round-robin temps.
704 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
705 Register Tmp =
706 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
  assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
         "VGPR used for an intermediate copy should have been reserved.");
709
710 // Only loop through if there are any free registers left. We don't want to
711 // spill.
712 while (RegNo--) {
713 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
714 /* RestoreAfter */ false, 0,
715 /* AllowSpill */ false);
716 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
717 break;
718 Tmp = Tmp2;
719 RS.setRegUsed(Tmp);
720 }
721
722 // Insert copy to temporary VGPR.
723 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
724 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
725 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
726 } else {
727 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
728 }
729
730 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
731 .addReg(SrcReg, getKillRegState(KillSrc));
732 if (ImpUseSuperReg) {
    UseBuilder.addReg(ImpUseSuperReg,
                      getKillRegState(KillSrc) | RegState::Implicit);
  }
736
737 MachineInstrBuilder DefBuilder
738 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
739 .addReg(Tmp, RegState::Kill);
740
741 if (ImpDefSuperReg)
742 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
743}
744
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
                           const TargetRegisterClass *RC, bool Forward) {
749 const SIRegisterInfo &RI = TII.getRegisterInfo();
750 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
752 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
753
754 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
755 int16_t SubIdx = BaseIndices[Idx];
756 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
757 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
758 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
759 unsigned Opcode = AMDGPU::S_MOV_B32;
760
761 // Is SGPR aligned? If so try to combine with next.
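    // For example, when both the source and destination 32-bit pieces are
    // even-aligned, a 128-bit SGPR tuple copy is emitted as two S_MOV_B64
    // instead of four S_MOV_B32.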
762 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
763 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
764 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
765 // Can use SGPR64 copy
766 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
767 SubIdx = RI.getSubRegFromChannel(Channel, 2);
768 DestSubReg = RI.getSubReg(DestReg, SubIdx);
769 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
770 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
771 Opcode = AMDGPU::S_MOV_B64;
772 Idx++;
773 }
774
775 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
776 .addReg(SrcSubReg)
777 .addReg(SrcReg, RegState::Implicit);
778
779 if (!FirstMI)
780 FirstMI = LastMI;
781
782 if (!Forward)
783 I--;
784 }
785
786 assert(FirstMI && LastMI);
787 if (!Forward)
788 std::swap(FirstMI, LastMI);
789
790 FirstMI->addOperand(
791 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
792
793 if (KillSrc)
794 LastMI->addRegisterKilled(SrcReg, &RI);
795}
796
void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc) const {
801 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
802 unsigned Size = RI.getRegSizeInBits(*RC);
803 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
804 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
805
806 // The rest of copyPhysReg assumes Src and Dst size are the same size.
807 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
808 // we remove Fix16BitCopies and this code block?
809 if (Fix16BitCopies) {
810 if (((Size == 16) != (SrcSize == 16))) {
811 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
      assert(ST.hasTrue16BitInsts());
      MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
814 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
815 RegToFix = SubReg;
816
817 if (DestReg == SrcReg) {
818 // Identity copy. Insert empty bundle since ExpandPostRA expects an
819 // instruction here.
820 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
821 return;
822 }
823 RC = RI.getPhysRegBaseClass(DestReg);
824 Size = RI.getRegSizeInBits(*RC);
825 SrcRC = RI.getPhysRegBaseClass(SrcReg);
826 SrcSize = RI.getRegSizeInBits(*SrcRC);
827 }
828 }
829
830 if (RC == &AMDGPU::VGPR_32RegClass) {
831 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
832 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
833 AMDGPU::AGPR_32RegClass.contains(SrcReg));
834 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
835 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
836 BuildMI(MBB, MI, DL, get(Opc), DestReg)
837 .addReg(SrcReg, getKillRegState(KillSrc));
838 return;
839 }
840
841 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
842 RC == &AMDGPU::SReg_32RegClass) {
843 if (SrcReg == AMDGPU::SCC) {
844 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
845 .addImm(1)
846 .addImm(0);
847 return;
848 }
849
850 if (DestReg == AMDGPU::VCC_LO) {
851 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
852 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
853 .addReg(SrcReg, getKillRegState(KillSrc));
854 } else {
855 // FIXME: Hack until VReg_1 removed.
856 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
857 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
858 .addImm(0)
859 .addReg(SrcReg, getKillRegState(KillSrc));
860 }
861
862 return;
863 }
864
865 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
866 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
867 return;
868 }
869
870 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
871 .addReg(SrcReg, getKillRegState(KillSrc));
872 return;
873 }
874
875 if (RC == &AMDGPU::SReg_64RegClass) {
876 if (SrcReg == AMDGPU::SCC) {
877 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
878 .addImm(1)
879 .addImm(0);
880 return;
881 }
882
883 if (DestReg == AMDGPU::VCC) {
884 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
885 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
886 .addReg(SrcReg, getKillRegState(KillSrc));
887 } else {
888 // FIXME: Hack until VReg_1 removed.
889 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
890 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
891 .addImm(0)
892 .addReg(SrcReg, getKillRegState(KillSrc));
893 }
894
895 return;
896 }
897
898 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
899 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
900 return;
901 }
902
903 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
904 .addReg(SrcReg, getKillRegState(KillSrc));
905 return;
906 }
907
908 if (DestReg == AMDGPU::SCC) {
909 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
910 // but SelectionDAG emits such copies for i1 sources.
911 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
912 // This copy can only be produced by patterns
913 // with explicit SCC, which are known to be enabled
914 // only for subtargets with S_CMP_LG_U64 present.
      assert(ST.hasScalarCompareEq64());
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
917 .addReg(SrcReg, getKillRegState(KillSrc))
918 .addImm(0);
919 } else {
920 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
921 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
922 .addReg(SrcReg, getKillRegState(KillSrc))
923 .addImm(0);
924 }
925
926 return;
927 }
928
929 if (RC == &AMDGPU::AGPR_32RegClass) {
930 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
931 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
932 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
933 .addReg(SrcReg, getKillRegState(KillSrc));
934 return;
935 }
936
937 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
938 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
939 .addReg(SrcReg, getKillRegState(KillSrc));
940 return;
941 }
942
943 // FIXME: Pass should maintain scavenger to avoid scan through the block on
944 // every AGPR spill.
945 RegScavenger RS;
946 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
947 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
948 return;
949 }
950
951 if (Size == 16) {
952 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
953 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
954 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
955
956 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
957 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
958 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
959 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
960 bool DstLow = !AMDGPU::isHi(DestReg, RI);
961 bool SrcLow = !AMDGPU::isHi(SrcReg, RI);
962 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
963 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
964
965 if (IsSGPRDst) {
966 if (!IsSGPRSrc) {
967 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
968 return;
969 }
970
971 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
972 .addReg(NewSrcReg, getKillRegState(KillSrc));
973 return;
974 }
975
976 if (IsAGPRDst || IsAGPRSrc) {
977 if (!DstLow || !SrcLow) {
978 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
979 "Cannot use hi16 subreg with an AGPR!");
980 }
981
982 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
983 return;
984 }
985
986 if (ST.hasTrue16BitInsts()) {
987 if (IsSGPRSrc) {
988 assert(SrcLow);
989 SrcReg = NewSrcReg;
990 }
991 // Use the smaller instruction encoding if possible.
992 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
993 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
994 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
995 .addReg(SrcReg);
996 } else {
997 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
998 .addImm(0) // src0_modifiers
999 .addReg(SrcReg)
1000 .addImm(0); // op_sel
1001 }
1002 return;
1003 }
1004
1005 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1006 if (!DstLow || !SrcLow) {
1007 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1008 "Cannot use hi16 subreg on VI!");
1009 }
1010
1011 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1012 .addReg(NewSrcReg, getKillRegState(KillSrc));
1013 return;
1014 }
1015
1016 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1017 .addImm(0) // src0_modifiers
1018 .addReg(NewSrcReg)
                   .addImm(0) // clamp
                   .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                                  : AMDGPU::SDWA::SdwaSel::WORD_1) // dst_sel
                   .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
                   .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                                  : AMDGPU::SDWA::SdwaSel::WORD_1) // src0_sel
                   .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1026 // First implicit operand is $exec.
1027 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1028 return;
1029 }
1030
1031 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1032 if (ST.hasMovB64()) {
1033 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1034 .addReg(SrcReg, getKillRegState(KillSrc));
1035 return;
1036 }
1037 if (ST.hasPkMovB32()) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
        .addImm(SISrcMods::OP_SEL_1) // src0_modifiers
        .addReg(SrcReg)
        .addImm(SISrcMods::OP_SEL_1) // src1_modifiers
        .addReg(SrcReg)
1043 .addImm(0) // op_sel_lo
1044 .addImm(0) // op_sel_hi
1045 .addImm(0) // neg_lo
1046 .addImm(0) // neg_hi
1047 .addImm(0) // clamp
1048 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1049 return;
1050 }
1051 }
1052
1053 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1054 if (RI.isSGPRClass(RC)) {
1055 if (!RI.isSGPRClass(SrcRC)) {
1056 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1057 return;
1058 }
1059 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1060 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1061 Forward);
1062 return;
1063 }
1064
1065 unsigned EltSize = 4;
1066 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1067 if (RI.isAGPRClass(RC)) {
1068 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1069 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1070 else if (RI.hasVGPRs(SrcRC) ||
1071 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1072 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1073 else
1074 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1075 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1076 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1077 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1078 (RI.isProperlyAlignedRC(*RC) &&
1079 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1080 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1081 if (ST.hasMovB64()) {
1082 Opcode = AMDGPU::V_MOV_B64_e32;
1083 EltSize = 8;
1084 } else if (ST.hasPkMovB32()) {
1085 Opcode = AMDGPU::V_PK_MOV_B32;
1086 EltSize = 8;
1087 }
1088 }
1089
1090 // For the cases where we need an intermediate instruction/temporary register
1091 // (destination is an AGPR), we need a scavenger.
1092 //
1093 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1094 // whole block for every handled copy.
1095 std::unique_ptr<RegScavenger> RS;
1096 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1097 RS.reset(new RegScavenger());
1098
1099 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1100
1101 // If there is an overlap, we can't kill the super-register on the last
1102 // instruction, since it will also kill the components made live by this def.
1103 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1104 const bool CanKillSuperReg = KillSrc && !Overlap;
1105
1106 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1107 unsigned SubIdx;
1108 if (Forward)
1109 SubIdx = SubIndices[Idx];
1110 else
1111 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1112 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1113 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1114 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1115
1116 bool IsFirstSubreg = Idx == 0;
1117 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1118
1119 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1120 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1121 Register ImpUseSuper = SrcReg;
1122 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1123 *RS, Overlap, ImpDefSuper, ImpUseSuper);
    } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
      MachineInstrBuilder MIB =
          BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
              .addImm(SISrcMods::OP_SEL_1) // src0_modifiers
              .addReg(SrcSubReg)
              .addImm(SISrcMods::OP_SEL_1) // src1_modifiers
              .addReg(SrcSubReg)
              .addImm(0) // op_sel_lo
              .addImm(0) // op_sel_hi
              .addImm(0) // neg_lo
              .addImm(0) // neg_hi
              .addImm(0) // clamp
              .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
      if (IsFirstSubreg)
        MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
    } else {
1140 MachineInstrBuilder Builder =
1141 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1142 if (IsFirstSubreg)
1143 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1144
1145 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1146 }
1147 }
1148}
1149
1150int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1151 int NewOpc;
1152
1153 // Try to map original to commuted opcode
1154 NewOpc = AMDGPU::getCommuteRev(Opcode);
1155 if (NewOpc != -1)
1156 // Check if the commuted (REV) opcode exists on the target.
1157 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1158
1159 // Try to map commuted to original opcode
1160 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1161 if (NewOpc != -1)
1162 // Check if the original (non-REV) opcode exists on the target.
1163 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1164
1165 return Opcode;
1166}
1167
void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       const DebugLoc &DL, Register DestReg,
                                       int64_t Value) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1174 if (RegClass == &AMDGPU::SReg_32RegClass ||
1175 RegClass == &AMDGPU::SGPR_32RegClass ||
1176 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1177 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1178 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1179 .addImm(Value);
1180 return;
1181 }
1182
1183 if (RegClass == &AMDGPU::SReg_64RegClass ||
1184 RegClass == &AMDGPU::SGPR_64RegClass ||
1185 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1186 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1187 .addImm(Value);
1188 return;
1189 }
1190
1191 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1192 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1193 .addImm(Value);
1194 return;
1195 }
1196 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1197 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1198 .addImm(Value);
1199 return;
1200 }
1201
1202 unsigned EltSize = 4;
1203 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1204 if (RI.isSGPRClass(RegClass)) {
1205 if (RI.getRegSizeInBits(*RegClass) > 32) {
1206 Opcode = AMDGPU::S_MOV_B64;
1207 EltSize = 8;
1208 } else {
1209 Opcode = AMDGPU::S_MOV_B32;
1210 EltSize = 4;
1211 }
1212 }
1213
1214 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1215 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1216 int64_t IdxValue = Idx == 0 ? Value : 0;
1217
1218 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1219 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1220 Builder.addImm(IdxValue);
1221 }
1222}
1223
const TargetRegisterClass *
SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
  return &AMDGPU::VGPR_32RegClass;
}
1228
void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I,
                                     const DebugLoc &DL, Register DstReg,
                                     ArrayRef<MachineOperand> Cond,
                                     Register TrueReg,
                                     Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *BoolXExecRC =
      RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1238 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1239 "Not a VGPR32 reg");
1240
1241 if (Cond.size() == 1) {
1242 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1243 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1244 .add(Cond[0]);
1245 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1246 .addImm(0)
1247 .addReg(FalseReg)
1248 .addImm(0)
1249 .addReg(TrueReg)
1250 .addReg(SReg);
1251 } else if (Cond.size() == 2) {
1252 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1253 switch (Cond[0].getImm()) {
1254 case SIInstrInfo::SCC_TRUE: {
1255 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1256 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1257 : AMDGPU::S_CSELECT_B64), SReg)
1258 .addImm(1)
1259 .addImm(0);
1260 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1261 .addImm(0)
1262 .addReg(FalseReg)
1263 .addImm(0)
1264 .addReg(TrueReg)
1265 .addReg(SReg);
1266 break;
1267 }
1268 case SIInstrInfo::SCC_FALSE: {
1269 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1270 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1271 : AMDGPU::S_CSELECT_B64), SReg)
1272 .addImm(0)
1273 .addImm(1);
1274 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1275 .addImm(0)
1276 .addReg(FalseReg)
1277 .addImm(0)
1278 .addReg(TrueReg)
1279 .addReg(SReg);
1280 break;
1281 }
1282 case SIInstrInfo::VCCNZ: {
1283 MachineOperand RegOp = Cond[1];
1284 RegOp.setImplicit(false);
1285 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1286 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1287 .add(RegOp);
1288 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1289 .addImm(0)
1290 .addReg(FalseReg)
1291 .addImm(0)
1292 .addReg(TrueReg)
1293 .addReg(SReg);
1294 break;
1295 }
1296 case SIInstrInfo::VCCZ: {
1297 MachineOperand RegOp = Cond[1];
1298 RegOp.setImplicit(false);
1299 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1300 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1301 .add(RegOp);
1302 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1303 .addImm(0)
1304 .addReg(TrueReg)
1305 .addImm(0)
1306 .addReg(FalseReg)
1307 .addReg(SReg);
1308 break;
1309 }
1310 case SIInstrInfo::EXECNZ: {
1311 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1312 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1313 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1314 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1315 .addImm(0);
1316 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1317 : AMDGPU::S_CSELECT_B64), SReg)
1318 .addImm(1)
1319 .addImm(0);
1320 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1321 .addImm(0)
1322 .addReg(FalseReg)
1323 .addImm(0)
1324 .addReg(TrueReg)
1325 .addReg(SReg);
1326 break;
1327 }
1328 case SIInstrInfo::EXECZ: {
1329 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1330 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1331 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1332 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1333 .addImm(0);
1334 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1335 : AMDGPU::S_CSELECT_B64), SReg)
1336 .addImm(0)
1337 .addImm(1);
1338 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1339 .addImm(0)
1340 .addReg(FalseReg)
1341 .addImm(0)
1342 .addReg(TrueReg)
1343 .addReg(SReg);
1344 llvm_unreachable("Unhandled branch predicate EXECZ");
1345 break;
1346 }
1347 default:
1348 llvm_unreachable("invalid branch predicate");
1349 }
1350 } else {
1351 llvm_unreachable("Can only handle Cond size 1 or 2");
1352 }
1353}
1354
Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1361 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1362 .addImm(Value)
1363 .addReg(SrcReg);
1364
1365 return Reg;
1366}
1367
Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1374 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1375 .addImm(Value)
1376 .addReg(SrcReg);
1377
1378 return Reg;
1379}
1380
unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1383 if (RI.isAGPRClass(DstRC))
1384 return AMDGPU::COPY;
1385 if (RI.getRegSizeInBits(*DstRC) == 16) {
1386 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1387 // before RA.
1388 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1389 } else if (RI.getRegSizeInBits(*DstRC) == 32) {
1390 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1391 } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
1392 return AMDGPU::S_MOV_B64;
1393 } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
1394 return AMDGPU::V_MOV_B64_PSEUDO;
1395 }
1396 return AMDGPU::COPY;
1397}
1398
const MCInstrDesc &
SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
                                     bool IsIndirectSrc) const {
1402 if (IsIndirectSrc) {
1403 if (VecSize <= 32) // 4 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1405 if (VecSize <= 64) // 8 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1407 if (VecSize <= 96) // 12 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1409 if (VecSize <= 128) // 16 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1411 if (VecSize <= 160) // 20 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1413 if (VecSize <= 256) // 32 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1415 if (VecSize <= 288) // 36 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1417 if (VecSize <= 320) // 40 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1419 if (VecSize <= 352) // 44 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1421 if (VecSize <= 384) // 48 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1423 if (VecSize <= 512) // 64 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1425 if (VecSize <= 1024) // 128 bytes
1426 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1427
1428 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1429 }
1430
1431 if (VecSize <= 32) // 4 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1433 if (VecSize <= 64) // 8 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1435 if (VecSize <= 96) // 12 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1437 if (VecSize <= 128) // 16 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1439 if (VecSize <= 160) // 20 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1441 if (VecSize <= 256) // 32 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1443 if (VecSize <= 288) // 36 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1445 if (VecSize <= 320) // 40 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1447 if (VecSize <= 352) // 44 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1449 if (VecSize <= 384) // 48 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1451 if (VecSize <= 512) // 64 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1453 if (VecSize <= 1024) // 128 bytes
1454 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1455
1456 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1457}
1458
1459static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1460 if (VecSize <= 32) // 4 bytes
1461 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1462 if (VecSize <= 64) // 8 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1464 if (VecSize <= 96) // 12 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1466 if (VecSize <= 128) // 16 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1468 if (VecSize <= 160) // 20 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1470 if (VecSize <= 256) // 32 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1472 if (VecSize <= 288) // 36 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1474 if (VecSize <= 320) // 40 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1476 if (VecSize <= 352) // 44 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1478 if (VecSize <= 384) // 48 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1480 if (VecSize <= 512) // 64 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1482 if (VecSize <= 1024) // 128 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1484
1485 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1486}
1487
1488static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1489 if (VecSize <= 32) // 4 bytes
1490 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1491 if (VecSize <= 64) // 8 bytes
1492 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1493 if (VecSize <= 96) // 12 bytes
1494 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1495 if (VecSize <= 128) // 16 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1497 if (VecSize <= 160) // 20 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1499 if (VecSize <= 256) // 32 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1501 if (VecSize <= 288) // 36 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1503 if (VecSize <= 320) // 40 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1505 if (VecSize <= 352) // 44 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1507 if (VecSize <= 384) // 48 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1509 if (VecSize <= 512) // 64 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1511 if (VecSize <= 1024) // 128 bytes
1512 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1513
1514 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1515}
1516
1517static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1518 if (VecSize <= 64) // 8 bytes
1519 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1520 if (VecSize <= 128) // 16 bytes
1521 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1522 if (VecSize <= 256) // 32 bytes
1523 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1524 if (VecSize <= 512) // 64 bytes
1525 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1526 if (VecSize <= 1024) // 128 bytes
1527 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1528
1529 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1530}
1531
1532const MCInstrDesc &
1533SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1534 bool IsSGPR) const {
1535 if (IsSGPR) {
1536 switch (EltSize) {
1537 case 32:
1538 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1539 case 64:
1540 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1541 default:
1542 llvm_unreachable("invalid reg indexing elt size");
1543 }
1544 }
1545
1546 assert(EltSize == 32 && "invalid reg indexing elt size");
  return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
}
1549
1550static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1551 switch (Size) {
1552 case 4:
1553 return AMDGPU::SI_SPILL_S32_SAVE;
1554 case 8:
1555 return AMDGPU::SI_SPILL_S64_SAVE;
1556 case 12:
1557 return AMDGPU::SI_SPILL_S96_SAVE;
1558 case 16:
1559 return AMDGPU::SI_SPILL_S128_SAVE;
1560 case 20:
1561 return AMDGPU::SI_SPILL_S160_SAVE;
1562 case 24:
1563 return AMDGPU::SI_SPILL_S192_SAVE;
1564 case 28:
1565 return AMDGPU::SI_SPILL_S224_SAVE;
1566 case 32:
1567 return AMDGPU::SI_SPILL_S256_SAVE;
1568 case 36:
1569 return AMDGPU::SI_SPILL_S288_SAVE;
1570 case 40:
1571 return AMDGPU::SI_SPILL_S320_SAVE;
1572 case 44:
1573 return AMDGPU::SI_SPILL_S352_SAVE;
1574 case 48:
1575 return AMDGPU::SI_SPILL_S384_SAVE;
1576 case 64:
1577 return AMDGPU::SI_SPILL_S512_SAVE;
1578 case 128:
1579 return AMDGPU::SI_SPILL_S1024_SAVE;
1580 default:
1581 llvm_unreachable("unknown register size");
1582 }
1583}
1584
1585static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1586 switch (Size) {
1587 case 4:
1588 return AMDGPU::SI_SPILL_V32_SAVE;
1589 case 8:
1590 return AMDGPU::SI_SPILL_V64_SAVE;
1591 case 12:
1592 return AMDGPU::SI_SPILL_V96_SAVE;
1593 case 16:
1594 return AMDGPU::SI_SPILL_V128_SAVE;
1595 case 20:
1596 return AMDGPU::SI_SPILL_V160_SAVE;
1597 case 24:
1598 return AMDGPU::SI_SPILL_V192_SAVE;
1599 case 28:
1600 return AMDGPU::SI_SPILL_V224_SAVE;
1601 case 32:
1602 return AMDGPU::SI_SPILL_V256_SAVE;
1603 case 36:
1604 return AMDGPU::SI_SPILL_V288_SAVE;
1605 case 40:
1606 return AMDGPU::SI_SPILL_V320_SAVE;
1607 case 44:
1608 return AMDGPU::SI_SPILL_V352_SAVE;
1609 case 48:
1610 return AMDGPU::SI_SPILL_V384_SAVE;
1611 case 64:
1612 return AMDGPU::SI_SPILL_V512_SAVE;
1613 case 128:
1614 return AMDGPU::SI_SPILL_V1024_SAVE;
1615 default:
1616 llvm_unreachable("unknown register size");
1617 }
1618}
1619
1620static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1621 switch (Size) {
1622 case 4:
1623 return AMDGPU::SI_SPILL_A32_SAVE;
1624 case 8:
1625 return AMDGPU::SI_SPILL_A64_SAVE;
1626 case 12:
1627 return AMDGPU::SI_SPILL_A96_SAVE;
1628 case 16:
1629 return AMDGPU::SI_SPILL_A128_SAVE;
1630 case 20:
1631 return AMDGPU::SI_SPILL_A160_SAVE;
1632 case 24:
1633 return AMDGPU::SI_SPILL_A192_SAVE;
1634 case 28:
1635 return AMDGPU::SI_SPILL_A224_SAVE;
1636 case 32:
1637 return AMDGPU::SI_SPILL_A256_SAVE;
1638 case 36:
1639 return AMDGPU::SI_SPILL_A288_SAVE;
1640 case 40:
1641 return AMDGPU::SI_SPILL_A320_SAVE;
1642 case 44:
1643 return AMDGPU::SI_SPILL_A352_SAVE;
1644 case 48:
1645 return AMDGPU::SI_SPILL_A384_SAVE;
1646 case 64:
1647 return AMDGPU::SI_SPILL_A512_SAVE;
1648 case 128:
1649 return AMDGPU::SI_SPILL_A1024_SAVE;
1650 default:
1651 llvm_unreachable("unknown register size");
1652 }
1653}
1654
1655static unsigned getAVSpillSaveOpcode(unsigned Size) {
1656 switch (Size) {
1657 case 4:
1658 return AMDGPU::SI_SPILL_AV32_SAVE;
1659 case 8:
1660 return AMDGPU::SI_SPILL_AV64_SAVE;
1661 case 12:
1662 return AMDGPU::SI_SPILL_AV96_SAVE;
1663 case 16:
1664 return AMDGPU::SI_SPILL_AV128_SAVE;
1665 case 20:
1666 return AMDGPU::SI_SPILL_AV160_SAVE;
1667 case 24:
1668 return AMDGPU::SI_SPILL_AV192_SAVE;
1669 case 28:
1670 return AMDGPU::SI_SPILL_AV224_SAVE;
1671 case 32:
1672 return AMDGPU::SI_SPILL_AV256_SAVE;
1673 case 36:
1674 return AMDGPU::SI_SPILL_AV288_SAVE;
1675 case 40:
1676 return AMDGPU::SI_SPILL_AV320_SAVE;
1677 case 44:
1678 return AMDGPU::SI_SPILL_AV352_SAVE;
1679 case 48:
1680 return AMDGPU::SI_SPILL_AV384_SAVE;
1681 case 64:
1682 return AMDGPU::SI_SPILL_AV512_SAVE;
1683 case 128:
1684 return AMDGPU::SI_SPILL_AV1024_SAVE;
1685 default:
1686 llvm_unreachable("unknown register size");
1687 }
1688}
1689
1690static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1691 bool IsVectorSuperClass) {
1692 // Currently, there is only 32-bit WWM register spills needed.
1693 if (Size != 4)
1694 llvm_unreachable("unknown wwm register spill size");
1695
1696 if (IsVectorSuperClass)
1697 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1698
1699 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1700}
1701
static unsigned getVectorRegSpillSaveOpcode(Register Reg,
                                            const TargetRegisterClass *RC,
                                            unsigned Size,
                                            const SIRegisterInfo &TRI,
                                            const SIMachineFunctionInfo &MFI) {
  bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);

  // Choose the right opcode if spilling a WWM register.
  if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
    return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);

  if (IsVectorSuperClass)
    return getAVSpillSaveOpcode(Size);

  return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
                             : getVGPRSpillSaveOpcode(Size);
}
1719
void SIInstrInfo::storeRegToStackSlot(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
    bool isKill, int FrameIndex, const TargetRegisterClass *RC,
    const TargetRegisterInfo *TRI, Register VReg) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));
  unsigned SpillSize = TRI->getSpillSize(*RC);

  MachineRegisterInfo &MRI = MF->getRegInfo();
  if (RI.isSGPRClass(RC)) {
1738 MFI->setHasSpilledSGPRs();
1739 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1740 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1741 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1742
1743 // We are only allowed to create one new instruction when spilling
1744 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1745 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1746
1747 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1748 // to make sure we are using the correct register class.
1749 if (SrcReg.isVirtual() && SpillSize == 4) {
1750 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1751 }
1752
1753 BuildMI(MBB, MI, DL, OpDesc)
1754 .addReg(SrcReg, getKillRegState(isKill)) // data
1755 .addFrameIndex(FrameIndex) // addr
1756 .addMemOperand(MMO)
1757 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1758
1759 if (RI.spillSGPRToVGPR())
1760 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1761 return;
1762 }
1763
1764 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1765 SpillSize, RI, *MFI);
1766 MFI->setHasSpilledVGPRs();
1767
1768 BuildMI(MBB, MI, DL, get(Opcode))
1769 .addReg(SrcReg, getKillRegState(isKill)) // data
1770 .addFrameIndex(FrameIndex) // addr
1771 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1772 .addImm(0) // offset
1773 .addMemOperand(MMO);
1774}
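// Rough shape of the result for the non-SGPR path (register and stack-slot
// names are illustrative, not taken from this file):
//   SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0,
//       implicit $exec :: (store (s128) into %stack.0, addrspace 5)
// i.e. the data register, the frame index, the scratch offset register from
// MFI->getStackPtrOffsetReg(), and a zero immediate offset, as added above.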
1775
1776static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1777 switch (Size) {
1778 case 4:
1779 return AMDGPU::SI_SPILL_S32_RESTORE;
1780 case 8:
1781 return AMDGPU::SI_SPILL_S64_RESTORE;
1782 case 12:
1783 return AMDGPU::SI_SPILL_S96_RESTORE;
1784 case 16:
1785 return AMDGPU::SI_SPILL_S128_RESTORE;
1786 case 20:
1787 return AMDGPU::SI_SPILL_S160_RESTORE;
1788 case 24:
1789 return AMDGPU::SI_SPILL_S192_RESTORE;
1790 case 28:
1791 return AMDGPU::SI_SPILL_S224_RESTORE;
1792 case 32:
1793 return AMDGPU::SI_SPILL_S256_RESTORE;
1794 case 36:
1795 return AMDGPU::SI_SPILL_S288_RESTORE;
1796 case 40:
1797 return AMDGPU::SI_SPILL_S320_RESTORE;
1798 case 44:
1799 return AMDGPU::SI_SPILL_S352_RESTORE;
1800 case 48:
1801 return AMDGPU::SI_SPILL_S384_RESTORE;
1802 case 64:
1803 return AMDGPU::SI_SPILL_S512_RESTORE;
1804 case 128:
1805 return AMDGPU::SI_SPILL_S1024_RESTORE;
1806 default:
1807 llvm_unreachable("unknown register size");
1808 }
1809}
1810
1811static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1812 switch (Size) {
1813 case 4:
1814 return AMDGPU::SI_SPILL_V32_RESTORE;
1815 case 8:
1816 return AMDGPU::SI_SPILL_V64_RESTORE;
1817 case 12:
1818 return AMDGPU::SI_SPILL_V96_RESTORE;
1819 case 16:
1820 return AMDGPU::SI_SPILL_V128_RESTORE;
1821 case 20:
1822 return AMDGPU::SI_SPILL_V160_RESTORE;
1823 case 24:
1824 return AMDGPU::SI_SPILL_V192_RESTORE;
1825 case 28:
1826 return AMDGPU::SI_SPILL_V224_RESTORE;
1827 case 32:
1828 return AMDGPU::SI_SPILL_V256_RESTORE;
1829 case 36:
1830 return AMDGPU::SI_SPILL_V288_RESTORE;
1831 case 40:
1832 return AMDGPU::SI_SPILL_V320_RESTORE;
1833 case 44:
1834 return AMDGPU::SI_SPILL_V352_RESTORE;
1835 case 48:
1836 return AMDGPU::SI_SPILL_V384_RESTORE;
1837 case 64:
1838 return AMDGPU::SI_SPILL_V512_RESTORE;
1839 case 128:
1840 return AMDGPU::SI_SPILL_V1024_RESTORE;
1841 default:
1842 llvm_unreachable("unknown register size");
1843 }
1844}
1845
1846static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1847 switch (Size) {
1848 case 4:
1849 return AMDGPU::SI_SPILL_A32_RESTORE;
1850 case 8:
1851 return AMDGPU::SI_SPILL_A64_RESTORE;
1852 case 12:
1853 return AMDGPU::SI_SPILL_A96_RESTORE;
1854 case 16:
1855 return AMDGPU::SI_SPILL_A128_RESTORE;
1856 case 20:
1857 return AMDGPU::SI_SPILL_A160_RESTORE;
1858 case 24:
1859 return AMDGPU::SI_SPILL_A192_RESTORE;
1860 case 28:
1861 return AMDGPU::SI_SPILL_A224_RESTORE;
1862 case 32:
1863 return AMDGPU::SI_SPILL_A256_RESTORE;
1864 case 36:
1865 return AMDGPU::SI_SPILL_A288_RESTORE;
1866 case 40:
1867 return AMDGPU::SI_SPILL_A320_RESTORE;
1868 case 44:
1869 return AMDGPU::SI_SPILL_A352_RESTORE;
1870 case 48:
1871 return AMDGPU::SI_SPILL_A384_RESTORE;
1872 case 64:
1873 return AMDGPU::SI_SPILL_A512_RESTORE;
1874 case 128:
1875 return AMDGPU::SI_SPILL_A1024_RESTORE;
1876 default:
1877 llvm_unreachable("unknown register size");
1878 }
1879}
1880
1881static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1882 switch (Size) {
1883 case 4:
1884 return AMDGPU::SI_SPILL_AV32_RESTORE;
1885 case 8:
1886 return AMDGPU::SI_SPILL_AV64_RESTORE;
1887 case 12:
1888 return AMDGPU::SI_SPILL_AV96_RESTORE;
1889 case 16:
1890 return AMDGPU::SI_SPILL_AV128_RESTORE;
1891 case 20:
1892 return AMDGPU::SI_SPILL_AV160_RESTORE;
1893 case 24:
1894 return AMDGPU::SI_SPILL_AV192_RESTORE;
1895 case 28:
1896 return AMDGPU::SI_SPILL_AV224_RESTORE;
1897 case 32:
1898 return AMDGPU::SI_SPILL_AV256_RESTORE;
1899 case 36:
1900 return AMDGPU::SI_SPILL_AV288_RESTORE;
1901 case 40:
1902 return AMDGPU::SI_SPILL_AV320_RESTORE;
1903 case 44:
1904 return AMDGPU::SI_SPILL_AV352_RESTORE;
1905 case 48:
1906 return AMDGPU::SI_SPILL_AV384_RESTORE;
1907 case 64:
1908 return AMDGPU::SI_SPILL_AV512_RESTORE;
1909 case 128:
1910 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1911 default:
1912 llvm_unreachable("unknown register size");
1913 }
1914}
1915
1916static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1917 bool IsVectorSuperClass) {
1918 // Currently, only 32-bit WWM register spills are needed.
1919 if (Size != 4)
1920 llvm_unreachable("unknown wwm register spill size");
1921
1922 if (IsVectorSuperClass)
1923 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1924
1925 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1926}
1927
1928static unsigned
1930 unsigned Size, const SIRegisterInfo &TRI,
1931 const SIMachineFunctionInfo &MFI) {
1932 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1933
1934 // Choose the right opcode if restoring a WWM register.
1935 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1936 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1937
1938 if (IsVectorSuperClass)
1939 return getAVSpillRestoreOpcode(Size);
1940
1941 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1942 : getVGPRSpillRestoreOpcode(Size);
1943}
1944
1945 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1946 MachineBasicBlock::iterator MI,
1947 Register DestReg, int FrameIndex,
1948 const TargetRegisterClass *RC,
1949 const TargetRegisterInfo *TRI,
1950 Register VReg) const {
1953 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1954 const DebugLoc &DL = MBB.findDebugLoc(MI);
1955 unsigned SpillSize = TRI->getSpillSize(*RC);
1956
1957 MachinePointerInfo PtrInfo
1958 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1959
1961 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1962 FrameInfo.getObjectAlign(FrameIndex));
1963
1964 if (RI.isSGPRClass(RC)) {
1965 MFI->setHasSpilledSGPRs();
1966 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1967 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1968 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1969
1970 // FIXME: Maybe this should not include a memoperand because it will be
1971 // lowered to non-memory instructions.
1972 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1973 if (DestReg.isVirtual() && SpillSize == 4) {
1975 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1976 }
1977
1978 if (RI.spillSGPRToVGPR())
1979 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1980 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1981 .addFrameIndex(FrameIndex) // addr
1982 .addMemOperand(MMO)
1983 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1984
1985 return;
1986 }
1987
1988 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1989 SpillSize, RI, *MFI);
1990 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1991 .addFrameIndex(FrameIndex) // vaddr
1992 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1993 .addImm(0) // offset
1994 .addMemOperand(MMO);
1995}
1996
1999 insertNoops(MBB, MI, 1);
2000}
2001
2004 unsigned Quantity) const {
2006 while (Quantity > 0) {
2007 unsigned Arg = std::min(Quantity, 8u);
2008 Quantity -= Arg;
2009 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
2010 }
2011}
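// Worked example of the loop above: insertNoops(MBB, MI, 10) emits
//   s_nop 7   ; 8 wait states (the S_NOP immediate encodes count - 1)
//   s_nop 1   ; 2 wait states
// since a single S_NOP covers at most 8 wait states.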
2012
2014 auto MF = MBB.getParent();
2016
2017 assert(Info->isEntryFunction());
2018
2019 if (MBB.succ_empty()) {
2020 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2021 if (HasNoTerminator) {
2022 if (Info->returnsVoid()) {
2023 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2024 } else {
2025 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2026 }
2027 }
2028 }
2029}
2030
2034 const DebugLoc &DL) const {
2036 MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2038 MF->push_back(HaltLoop);
2039
2040 constexpr unsigned DoorbellIDMask = 0x3ff;
2041 constexpr unsigned ECQueueWaveAbort = 0x400;
2042
2043 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
2044 // this will be a nop.
2045 BuildMI(MBB, MI, DL, get(AMDGPU::S_TRAP))
2046 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2047 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2048 BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG_RTN_B32), DoorbellReg)
2050 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2051 .addUse(AMDGPU::M0);
2052 Register DoorbellRegMasked =
2053 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2054 BuildMI(MBB, MI, DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2055 .addUse(DoorbellReg)
2056 .addImm(DoorbellIDMask);
2057 Register SetWaveAbortBit =
2058 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2059 BuildMI(MBB, MI, DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2060 .addUse(DoorbellRegMasked)
2061 .addImm(ECQueueWaveAbort);
2062 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2063 .addUse(SetWaveAbortBit);
2064 BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG))
2066 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2067 .addUse(AMDGPU::TTMP2);
2068 BuildMI(MBB, MI, DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoop);
2069
2070 BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2071 BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_BRANCH))
2072 .addMBB(HaltLoop);
2073
2074 if (SplitBB != &MBB)
2075 MBB.removeSuccessor(SplitBB);
2076 MBB.addSuccessor(HaltLoop);
2077 HaltLoop->addSuccessor(HaltLoop);
2078
2079 return SplitBB;
2080}
2081
2083 switch (MI.getOpcode()) {
2084 default:
2085 if (MI.isMetaInstruction())
2086 return 0;
2087 return 1; // FIXME: Do wait states equal cycles?
2088
2089 case AMDGPU::S_NOP:
2090 return MI.getOperand(0).getImm() + 1;
2091 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2092 // hazard, even if one exists, won't really be visible. Should we handle it?
2093 }
2094}
2095
2097 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2098 MachineBasicBlock &MBB = *MI.getParent();
2100 switch (MI.getOpcode()) {
2101 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2102 case AMDGPU::S_MOV_B64_term:
2103 // This is only a terminator to get the correct spill code placement during
2104 // register allocation.
2105 MI.setDesc(get(AMDGPU::S_MOV_B64));
2106 break;
2107
2108 case AMDGPU::S_MOV_B32_term:
2109 // This is only a terminator to get the correct spill code placement during
2110 // register allocation.
2111 MI.setDesc(get(AMDGPU::S_MOV_B32));
2112 break;
2113
2114 case AMDGPU::S_XOR_B64_term:
2115 // This is only a terminator to get the correct spill code placement during
2116 // register allocation.
2117 MI.setDesc(get(AMDGPU::S_XOR_B64));
2118 break;
2119
2120 case AMDGPU::S_XOR_B32_term:
2121 // This is only a terminator to get the correct spill code placement during
2122 // register allocation.
2123 MI.setDesc(get(AMDGPU::S_XOR_B32));
2124 break;
2125 case AMDGPU::S_OR_B64_term:
2126 // This is only a terminator to get the correct spill code placement during
2127 // register allocation.
2128 MI.setDesc(get(AMDGPU::S_OR_B64));
2129 break;
2130 case AMDGPU::S_OR_B32_term:
2131 // This is only a terminator to get the correct spill code placement during
2132 // register allocation.
2133 MI.setDesc(get(AMDGPU::S_OR_B32));
2134 break;
2135
2136 case AMDGPU::S_ANDN2_B64_term:
2137 // This is only a terminator to get the correct spill code placement during
2138 // register allocation.
2139 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2140 break;
2141
2142 case AMDGPU::S_ANDN2_B32_term:
2143 // This is only a terminator to get the correct spill code placement during
2144 // register allocation.
2145 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2146 break;
2147
2148 case AMDGPU::S_AND_B64_term:
2149 // This is only a terminator to get the correct spill code placement during
2150 // register allocation.
2151 MI.setDesc(get(AMDGPU::S_AND_B64));
2152 break;
2153
2154 case AMDGPU::S_AND_B32_term:
2155 // This is only a terminator to get the correct spill code placement during
2156 // register allocation.
2157 MI.setDesc(get(AMDGPU::S_AND_B32));
2158 break;
2159
2160 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2161 // This is only a terminator to get the correct spill code placement during
2162 // register allocation.
2163 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2164 break;
2165
2166 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2167 // This is only a terminator to get the correct spill code placement during
2168 // register allocation.
2169 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2170 break;
2171
2172 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2173 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2174 break;
2175
2176 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2177 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2178 break;
2179
2180 case AMDGPU::V_MOV_B64_PSEUDO: {
2181 Register Dst = MI.getOperand(0).getReg();
2182 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2183 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2184
2185 const MachineOperand &SrcOp = MI.getOperand(1);
2186 // FIXME: Will this work for 64-bit floating point immediates?
2187 assert(!SrcOp.isFPImm());
2188 if (ST.hasMovB64()) {
2189 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2190 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2191 isUInt<32>(SrcOp.getImm()))
2192 break;
2193 }
2194 if (SrcOp.isImm()) {
2195 APInt Imm(64, SrcOp.getImm());
2196 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2197 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2198 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2199 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2201 .addImm(Lo.getSExtValue())
2203 .addImm(Lo.getSExtValue())
2204 .addImm(0) // op_sel_lo
2205 .addImm(0) // op_sel_hi
2206 .addImm(0) // neg_lo
2207 .addImm(0) // neg_hi
2208 .addImm(0); // clamp
2209 } else {
2210 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2211 .addImm(Lo.getSExtValue())
2213 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2214 .addImm(Hi.getSExtValue())
2216 }
2217 } else {
2218 assert(SrcOp.isReg());
2219 if (ST.hasPkMovB32() &&
2220 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2221 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2222 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2223 .addReg(SrcOp.getReg())
2225 .addReg(SrcOp.getReg())
2226 .addImm(0) // op_sel_lo
2227 .addImm(0) // op_sel_hi
2228 .addImm(0) // neg_lo
2229 .addImm(0) // neg_hi
2230 .addImm(0); // clamp
2231 } else {
2232 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2233 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2235 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2236 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2238 }
2239 }
2240 MI.eraseFromParent();
2241 break;
2242 }
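// Worked example for the immediate path when neither v_mov_b64 nor a packed
// move applies: an immediate of 0x7FFFFFFF00000001 splits into
//   v_mov_b32 dst.sub0, 1
//   v_mov_b32 dst.sub1, 0x7fffffff
// while 0x0000000500000005 has Lo == Hi == 5 (an inline constant), so a single
// V_PK_MOV_B32 writing both halves is used when ST.hasPkMovB32().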
2243 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2245 break;
2246 }
2247 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2248 const MachineOperand &SrcOp = MI.getOperand(1);
2249 assert(!SrcOp.isFPImm());
2250 APInt Imm(64, SrcOp.getImm());
2251 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2252 MI.setDesc(get(AMDGPU::S_MOV_B64));
2253 break;
2254 }
2255
2256 Register Dst = MI.getOperand(0).getReg();
2257 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2258 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2259
2260 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2261 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2262 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2263 .addImm(Lo.getSExtValue())
2265 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2266 .addImm(Hi.getSExtValue())
2268 MI.eraseFromParent();
2269 break;
2270 }
2271 case AMDGPU::V_SET_INACTIVE_B32: {
2272 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2273 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2274 // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
2275 // optimizations (mainly Register Coalescer) aware of WWM register liveness.
2276 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2277 .add(MI.getOperand(1));
2278 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2279 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2280 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2281 .add(MI.getOperand(2));
2282 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2283 .addReg(Exec);
2284 MI.eraseFromParent();
2285 break;
2286 }
2287 case AMDGPU::V_SET_INACTIVE_B64: {
2288 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2289 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2290 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2291 MI.getOperand(0).getReg())
2292 .add(MI.getOperand(1));
2293 expandPostRAPseudo(*Copy);
2294 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2295 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2296 Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2297 MI.getOperand(0).getReg())
2298 .add(MI.getOperand(2));
2299 expandPostRAPseudo(*Copy);
2300 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2301 .addReg(Exec);
2302 MI.eraseFromParent();
2303 break;
2304 }
2305 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2306 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2307 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2308 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2309 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2310 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2311 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2312 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2313 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2314 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2315 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2316 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2317 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2318 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2319 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2320 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2322 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2323 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2324 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2325 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2326 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2327 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2328 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2329 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2330 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2331 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2332 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2333 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2334 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2335
2336 unsigned Opc;
2337 if (RI.hasVGPRs(EltRC)) {
2338 Opc = AMDGPU::V_MOVRELD_B32_e32;
2339 } else {
2340 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2341 : AMDGPU::S_MOVRELD_B32;
2342 }
2343
2344 const MCInstrDesc &OpDesc = get(Opc);
2345 Register VecReg = MI.getOperand(0).getReg();
2346 bool IsUndef = MI.getOperand(1).isUndef();
2347 unsigned SubReg = MI.getOperand(3).getImm();
2348 assert(VecReg == MI.getOperand(1).getReg());
2349
2351 BuildMI(MBB, MI, DL, OpDesc)
2352 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2353 .add(MI.getOperand(2))
2355 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2356
2357 const int ImpDefIdx =
2358 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2359 const int ImpUseIdx = ImpDefIdx + 1;
2360 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2361 MI.eraseFromParent();
2362 break;
2363 }
2364 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2365 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2366 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2367 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2368 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2369 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2370 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2371 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2372 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2373 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2374 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2375 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2377 Register VecReg = MI.getOperand(0).getReg();
2378 bool IsUndef = MI.getOperand(1).isUndef();
2379 Register Idx = MI.getOperand(3).getReg();
2380 Register SubReg = MI.getOperand(4).getImm();
2381
2382 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2383 .addReg(Idx)
2385 SetOn->getOperand(3).setIsUndef();
2386
2387 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2389 BuildMI(MBB, MI, DL, OpDesc)
2390 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2391 .add(MI.getOperand(2))
2393 .addReg(VecReg,
2394 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2395
2396 const int ImpDefIdx =
2397 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2398 const int ImpUseIdx = ImpDefIdx + 1;
2399 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2400
2401 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2402
2403 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2404
2405 MI.eraseFromParent();
2406 break;
2407 }
2408 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2409 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2410 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2411 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2412 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2413 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2414 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2415 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2416 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2417 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2418 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2419 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2421 Register Dst = MI.getOperand(0).getReg();
2422 Register VecReg = MI.getOperand(1).getReg();
2423 bool IsUndef = MI.getOperand(1).isUndef();
2424 Register Idx = MI.getOperand(2).getReg();
2425 Register SubReg = MI.getOperand(3).getImm();
2426
2427 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2428 .addReg(Idx)
2430 SetOn->getOperand(3).setIsUndef();
2431
2432 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2433 .addDef(Dst)
2434 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2435 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2436
2437 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2438
2439 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2440
2441 MI.eraseFromParent();
2442 break;
2443 }
2444 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2445 MachineFunction &MF = *MBB.getParent();
2446 Register Reg = MI.getOperand(0).getReg();
2447 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2448 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2449 MachineOperand OpLo = MI.getOperand(1);
2450 MachineOperand OpHi = MI.getOperand(2);
2451
2452 // Create a bundle so these instructions won't be re-ordered by the
2453 // post-RA scheduler.
2454 MIBundleBuilder Bundler(MBB, MI);
2455 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2456
2457 // What we want here is an offset from the value returned by s_getpc (which
2458 // is the address of the s_add_u32 instruction) to the global variable, but
2459 // since the encoding of $symbol starts 4 bytes after the start of the
2460 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2461 // small. This requires us to add 4 to the global variable offset in order
2462 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2463 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2464 // instruction.
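// Concretely: if s_getpc_b64 yields the address P of the following s_add_u32,
// the s_add_u32 literal is encoded at P + 4 (after its 4-byte opcode) and the
// s_addc_u32 literal at P + 12 (4-byte s_add_u32 opcode + 4-byte literal +
// 4-byte s_addc_u32 opcode), which is where the +4 and +12 below come from.
// The hasGetPCZeroExtension() workaround places one extra 4-byte instruction
// ahead of the pair, accounted for by the additional Adjust of 4.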
2465
2466 int64_t Adjust = 0;
2467 if (ST.hasGetPCZeroExtension()) {
2468 // Fix up hardware that does not sign-extend the 48-bit PC value by
2469 // inserting: s_sext_i32_i16 reghi, reghi
2470 Bundler.append(
2471 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2472 Adjust += 4;
2473 }
2474
2475 if (OpLo.isGlobal())
2476 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2477 Bundler.append(
2478 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2479
2480 if (OpHi.isGlobal())
2481 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2482 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2483 .addReg(RegHi)
2484 .add(OpHi));
2485
2486 finalizeBundle(MBB, Bundler.begin());
2487
2488 MI.eraseFromParent();
2489 break;
2490 }
2491 case AMDGPU::ENTER_STRICT_WWM: {
2492 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2493 // Whole Wave Mode is entered.
2494 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2495 : AMDGPU::S_OR_SAVEEXEC_B64));
2496 break;
2497 }
2498 case AMDGPU::ENTER_STRICT_WQM: {
2499 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2500 // STRICT_WQM is entered.
2501 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2502 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2503 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2504 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2505 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2506
2507 MI.eraseFromParent();
2508 break;
2509 }
2510 case AMDGPU::EXIT_STRICT_WWM:
2511 case AMDGPU::EXIT_STRICT_WQM: {
2512 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2513 // WWM/STRICT_WQM is exited.
2514 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2515 break;
2516 }
2517 case AMDGPU::ENTER_PSEUDO_WM:
2518 case AMDGPU::EXIT_PSEUDO_WM: {
2519 // These do nothing.
2520 MI.eraseFromParent();
2521 break;
2522 }
2523 case AMDGPU::SI_RETURN: {
2524 const MachineFunction *MF = MBB.getParent();
2525 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2526 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2527 // Hiding the return address use with SI_RETURN may lead to extra kills in
2528 // the function and missing live-ins. We are fine in practice because callee
2529 // saved register handling ensures the register value is restored before
2530 // RET, but we need the undef flag here to appease the MachineVerifier
2531 // liveness checks.
2533 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2534 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2535
2536 MIB.copyImplicitOps(MI);
2537 MI.eraseFromParent();
2538 break;
2539 }
2540
2541 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2542 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2543 MI.setDesc(get(AMDGPU::S_MUL_U64));
2544 break;
2545
2546 case AMDGPU::S_GETPC_B64_pseudo:
2547 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2548 if (ST.hasGetPCZeroExtension()) {
2549 Register Dst = MI.getOperand(0).getReg();
2550 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2551 // Fix up hardware that does not sign-extend the 48-bit PC value by
2552 // inserting: s_sext_i32_i16 dsthi, dsthi
2553 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2554 DstHi)
2555 .addReg(DstHi);
2556 }
2557 break;
2558 }
2559 return true;
2560}
2561
2564 unsigned SubIdx, const MachineInstr &Orig,
2565 const TargetRegisterInfo &RI) const {
2566
2567 // Try shrinking the instruction to remat only the part needed for the current
2568 // context.
2569 // TODO: Handle more cases.
2570 unsigned Opcode = Orig.getOpcode();
2571 switch (Opcode) {
2572 case AMDGPU::S_LOAD_DWORDX16_IMM:
2573 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2574 if (SubIdx != 0)
2575 break;
2576
2577 if (I == MBB.end())
2578 break;
2579
2580 if (I->isBundled())
2581 break;
2582
2583 // Look for a single use of the register that is also a subreg.
2584 Register RegToFind = Orig.getOperand(0).getReg();
2585 MachineOperand *UseMO = nullptr;
2586 for (auto &CandMO : I->operands()) {
2587 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2588 continue;
2589 if (UseMO) {
2590 UseMO = nullptr;
2591 break;
2592 }
2593 UseMO = &CandMO;
2594 }
2595 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2596 break;
2597
2598 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2599 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2600
2603 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2604
2605 unsigned NewOpcode = -1;
2606 if (SubregSize == 256)
2607 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2608 else if (SubregSize == 128)
2609 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2610 else
2611 break;
2612
2613 const MCInstrDesc &TID = get(NewOpcode);
2614 const TargetRegisterClass *NewRC =
2615 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2616 MRI.setRegClass(DestReg, NewRC);
2617
2618 UseMO->setReg(DestReg);
2619 UseMO->setSubReg(AMDGPU::NoSubRegister);
2620
2621 // Use a smaller load with the desired size, possibly with updated offset.
2622 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2623 MI->setDesc(TID);
2624 MI->getOperand(0).setReg(DestReg);
2625 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2626 if (Offset) {
2627 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2628 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2629 OffsetMO->setImm(FinalOffset);
2630 }
2632 for (const MachineMemOperand *MemOp : Orig.memoperands())
2633 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2634 SubregSize / 8));
2635 MI->setMemRefs(*MF, NewMMOs);
2636
2637 MBB.insert(I, MI);
2638 return;
2639 }
2640
2641 default:
2642 break;
2643 }
2644
2645 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2646}
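// Illustrative case for the shrink-on-remat path above: if the single use
// reads only the upper half (sub-register offset 256 bits) of an
// S_LOAD_DWORDX16_IMM result, the clone becomes an S_LOAD_DWORDX8_IMM whose
// immediate offset grows by 256 / 8 = 32 bytes and whose memory operand is
// narrowed to 32 bytes.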
2647
2648std::pair<MachineInstr*, MachineInstr*>
2650 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2651
2652 if (ST.hasMovB64() &&
2654 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2655 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2656 return std::pair(&MI, nullptr);
2657 }
2658
2659 MachineBasicBlock &MBB = *MI.getParent();
2663 Register Dst = MI.getOperand(0).getReg();
2664 unsigned Part = 0;
2665 MachineInstr *Split[2];
2666
2667 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2668 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2669 if (Dst.isPhysical()) {
2670 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2671 } else {
2672 assert(MRI.isSSA());
2673 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2674 MovDPP.addDef(Tmp);
2675 }
2676
2677 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2678 const MachineOperand &SrcOp = MI.getOperand(I);
2679 assert(!SrcOp.isFPImm());
2680 if (SrcOp.isImm()) {
2681 APInt Imm(64, SrcOp.getImm());
2682 Imm.ashrInPlace(Part * 32);
2683 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2684 } else {
2685 assert(SrcOp.isReg());
2686 Register Src = SrcOp.getReg();
2687 if (Src.isPhysical())
2688 MovDPP.addReg(RI.getSubReg(Src, Sub));
2689 else
2690 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2691 }
2692 }
2693
2694 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2695 MovDPP.addImm(MO.getImm());
2696
2697 Split[Part] = MovDPP;
2698 ++Part;
2699 }
2700
2701 if (Dst.isVirtual())
2702 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2703 .addReg(Split[0]->getOperand(0).getReg())
2704 .addImm(AMDGPU::sub0)
2705 .addReg(Split[1]->getOperand(0).getReg())
2706 .addImm(AMDGPU::sub1);
2707
2708 MI.eraseFromParent();
2709 return std::pair(Split[0], Split[1]);
2710}
2711
2712std::optional<DestSourcePair>
2714 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2715 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2716
2717 return std::nullopt;
2718}
2719
2721 MachineOperand &Src0,
2722 unsigned Src0OpName,
2723 MachineOperand &Src1,
2724 unsigned Src1OpName) const {
2725 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2726 if (!Src0Mods)
2727 return false;
2728
2729 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2730 assert(Src1Mods &&
2731 "All commutable instructions have both src0 and src1 modifiers");
2732
2733 int Src0ModsVal = Src0Mods->getImm();
2734 int Src1ModsVal = Src1Mods->getImm();
2735
2736 Src1Mods->setImm(Src0ModsVal);
2737 Src0Mods->setImm(Src1ModsVal);
2738 return true;
2739}
2740
2742 MachineOperand &RegOp,
2743 MachineOperand &NonRegOp) {
2744 Register Reg = RegOp.getReg();
2745 unsigned SubReg = RegOp.getSubReg();
2746 bool IsKill = RegOp.isKill();
2747 bool IsDead = RegOp.isDead();
2748 bool IsUndef = RegOp.isUndef();
2749 bool IsDebug = RegOp.isDebug();
2750
2751 if (NonRegOp.isImm())
2752 RegOp.ChangeToImmediate(NonRegOp.getImm());
2753 else if (NonRegOp.isFI())
2754 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2755 else if (NonRegOp.isGlobal()) {
2756 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2757 NonRegOp.getTargetFlags());
2758 } else
2759 return nullptr;
2760
2761 // Make sure we don't reinterpret a subreg index in the target flags.
2762 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2763
2764 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2765 NonRegOp.setSubReg(SubReg);
2766
2767 return &MI;
2768}
2769
2771 unsigned Src0Idx,
2772 unsigned Src1Idx) const {
2773 assert(!NewMI && "this should never be used");
2774
2775 unsigned Opc = MI.getOpcode();
2776 int CommutedOpcode = commuteOpcode(Opc);
2777 if (CommutedOpcode == -1)
2778 return nullptr;
2779
2780 if (Src0Idx > Src1Idx)
2781 std::swap(Src0Idx, Src1Idx);
2782
2783 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2784 static_cast<int>(Src0Idx) &&
2785 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2786 static_cast<int>(Src1Idx) &&
2787 "inconsistency with findCommutedOpIndices");
2788
2789 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2790 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2791
2792 MachineInstr *CommutedMI = nullptr;
2793 if (Src0.isReg() && Src1.isReg()) {
2794 if (isOperandLegal(MI, Src1Idx, &Src0)) {
2795 // Be sure to copy the source modifiers to the right place.
2796 CommutedMI
2797 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2798 }
2799
2800 } else if (Src0.isReg() && !Src1.isReg()) {
2801 // src0 should always be able to support any operand type, so no need to
2802 // check operand legality.
2803 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2804 } else if (!Src0.isReg() && Src1.isReg()) {
2805 if (isOperandLegal(MI, Src1Idx, &Src0))
2806 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2807 } else {
2808 // FIXME: Found two non-register operands to commute. This does happen.
2809 return nullptr;
2810 }
2811
2812 if (CommutedMI) {
2813 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2814 Src1, AMDGPU::OpName::src1_modifiers);
2815
2816 CommutedMI->setDesc(get(CommutedOpcode));
2817 }
2818
2819 return CommutedMI;
2820}
2821
2822// This needs to be implemented because the source modifiers may be inserted
2823// between the true commutable operands, and the base
2824// TargetInstrInfo::commuteInstruction uses it.
2826 unsigned &SrcOpIdx0,
2827 unsigned &SrcOpIdx1) const {
2828 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2829}
2830
2832 unsigned &SrcOpIdx0,
2833 unsigned &SrcOpIdx1) const {
2834 if (!Desc.isCommutable())
2835 return false;
2836
2837 unsigned Opc = Desc.getOpcode();
2838 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2839 if (Src0Idx == -1)
2840 return false;
2841
2842 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2843 if (Src1Idx == -1)
2844 return false;
2845
2846 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2847}
2848
2850 int64_t BrOffset) const {
2851 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2852 // block is unanalyzable.
2853 assert(BranchOp != AMDGPU::S_SETPC_B64);
2854
2855 // Convert to dwords.
2856 BrOffset /= 4;
2857
2858 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2859 // from the next instruction.
2860 BrOffset -= 1;
2861
2862 return isIntN(BranchOffsetBits, BrOffset);
2863}
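// Example with the default 16 offset bits: the encodable range is
// -32768..32767 dwords, i.e. about +/-128 KiB measured from the instruction
// after the branch, so a 200000-byte forward branch fails this check and has
// to be relaxed.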
2864
2867 return MI.getOperand(0).getMBB();
2868}
2869
2871 for (const MachineInstr &MI : MBB->terminators()) {
2872 if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
2873 MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2874 MI.getOpcode() == AMDGPU::SI_LOOP)
2875 return true;
2876 }
2877 return false;
2878}
2879
2881 MachineBasicBlock &DestBB,
2882 MachineBasicBlock &RestoreBB,
2883 const DebugLoc &DL, int64_t BrOffset,
2884 RegScavenger *RS) const {
2885 assert(RS && "RegScavenger required for long branching");
2886 assert(MBB.empty() &&
2887 "new block should be inserted for expanding unconditional branch");
2888 assert(MBB.pred_size() == 1);
2889 assert(RestoreBB.empty() &&
2890 "restore block should be inserted for restoring clobbered registers");
2891
2895
2896 // FIXME: Virtual register workaround for RegScavenger not working with empty
2897 // blocks.
2898 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2899
2900 auto I = MBB.end();
2901
2902 // We need to compute the offset relative to the instruction immediately after
2903 // s_getpc_b64. Insert the PC arithmetic code before the last terminator.
2904 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2905
2906 auto &MCCtx = MF->getContext();
2907 MCSymbol *PostGetPCLabel =
2908 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2909 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2910
2911 MCSymbol *OffsetLo =
2912 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2913 MCSymbol *OffsetHi =
2914 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2915 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2916 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2917 .addReg(PCReg, 0, AMDGPU::sub0)
2918 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2919 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2920 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2921 .addReg(PCReg, 0, AMDGPU::sub1)
2922 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2923
2924 // Insert the indirect branch after the other terminator.
2925 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2926 .addReg(PCReg);
2927
2928 // If a spill is needed for the pc register pair, we need to insert a spill
2929 // restore block right before the destination block, and insert a short branch
2930 // into the old destination block's fallthrough predecessor.
2931 // e.g.:
2932 //
2933 // s_cbranch_scc0 skip_long_branch:
2934 //
2935 // long_branch_bb:
2936 // spill s[8:9]
2937 // s_getpc_b64 s[8:9]
2938 // s_add_u32 s8, s8, restore_bb
2939 // s_addc_u32 s9, s9, 0
2940 // s_setpc_b64 s[8:9]
2941 //
2942 // skip_long_branch:
2943 // foo;
2944 //
2945 // .....
2946 //
2947 // dest_bb_fallthrough_predecessor:
2948 // bar;
2949 // s_branch dest_bb
2950 //
2951 // restore_bb:
2952 // restore s[8:9]
2953 // fallthrough dest_bb
2954 //
2955 // dest_bb:
2956 // buzz;
2957
2958 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2959 Register Scav;
2960
2961 // If we've previously reserved a register for long branches,
2962 // avoid running the scavenger and just use that register.
2963 if (LongBranchReservedReg) {
2964 RS->enterBasicBlock(MBB);
2965 Scav = LongBranchReservedReg;
2966 } else {
2968 Scav = RS->scavengeRegisterBackwards(
2969 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2970 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2971 }
2972 if (Scav) {
2973 RS->setRegUsed(Scav);
2974 MRI.replaceRegWith(PCReg, Scav);
2975 MRI.clearVirtRegs();
2976 } else {
2977 // Since spilling an SGPR requires a VGPR, we reuse the slot of the temporary
2978 // VGPR for the SGPR spill.
2979 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2980 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2981 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
2982 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
2983 MRI.clearVirtRegs();
2984 }
2985
2986 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
2987 // Now the distance can be defined.
2989 MCSymbolRefExpr::create(DestLabel, MCCtx),
2990 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
2991 // Add offset assignments.
2992 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
2993 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
2994 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
2995 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
2996}
2997
2998unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2999 switch (Cond) {
3000 case SIInstrInfo::SCC_TRUE:
3001 return AMDGPU::S_CBRANCH_SCC1;
3002 case SIInstrInfo::SCC_FALSE:
3003 return AMDGPU::S_CBRANCH_SCC0;
3004 case SIInstrInfo::VCCNZ:
3005 return AMDGPU::S_CBRANCH_VCCNZ;
3006 case SIInstrInfo::VCCZ:
3007 return AMDGPU::S_CBRANCH_VCCZ;
3008 case SIInstrInfo::EXECNZ:
3009 return AMDGPU::S_CBRANCH_EXECNZ;
3010 case SIInstrInfo::EXECZ:
3011 return AMDGPU::S_CBRANCH_EXECZ;
3012 default:
3013 llvm_unreachable("invalid branch predicate");
3014 }
3015}
3016
3017SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3018 switch (Opcode) {
3019 case AMDGPU::S_CBRANCH_SCC0:
3020 return SCC_FALSE;
3021 case AMDGPU::S_CBRANCH_SCC1:
3022 return SCC_TRUE;
3023 case AMDGPU::S_CBRANCH_VCCNZ:
3024 return VCCNZ;
3025 case AMDGPU::S_CBRANCH_VCCZ:
3026 return VCCZ;
3027 case AMDGPU::S_CBRANCH_EXECNZ:
3028 return EXECNZ;
3029 case AMDGPU::S_CBRANCH_EXECZ:
3030 return EXECZ;
3031 default:
3032 return INVALID_BR;
3033 }
3034}
3035
3039 MachineBasicBlock *&FBB,
3041 bool AllowModify) const {
3042 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3043 // Unconditional Branch
3044 TBB = I->getOperand(0).getMBB();
3045 return false;
3046 }
3047
3048 MachineBasicBlock *CondBB = nullptr;
3049
3050 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
3051 CondBB = I->getOperand(1).getMBB();
3052 Cond.push_back(I->getOperand(0));
3053 } else {
3054 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3055 if (Pred == INVALID_BR)
3056 return true;
3057
3058 CondBB = I->getOperand(0).getMBB();
3059 Cond.push_back(MachineOperand::CreateImm(Pred));
3060 Cond.push_back(I->getOperand(1)); // Save the branch register.
3061 }
3062 ++I;
3063
3064 if (I == MBB.end()) {
3065 // Conditional branch followed by fall-through.
3066 TBB = CondBB;
3067 return false;
3068 }
3069
3070 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3071 TBB = CondBB;
3072 FBB = I->getOperand(0).getMBB();
3073 return false;
3074 }
3075
3076 return true;
3077}
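// Example of the analysis result (register name illustrative): for a block
// ending in
//   S_CBRANCH_VCCNZ %bb.2, implicit $vcc
//   S_BRANCH %bb.3
// this returns false with TBB = %bb.2, FBB = %bb.3, and Cond = {VCCNZ, $vcc}.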
3078
3080 MachineBasicBlock *&FBB,
3082 bool AllowModify) const {
3084 auto E = MBB.end();
3085 if (I == E)
3086 return false;
3087
3088 // Skip over the instructions that are artificial terminators for special
3089 // exec management.
3090 while (I != E && !I->isBranch() && !I->isReturn()) {
3091 switch (I->getOpcode()) {
3092 case AMDGPU::S_MOV_B64_term:
3093 case AMDGPU::S_XOR_B64_term:
3094 case AMDGPU::S_OR_B64_term:
3095 case AMDGPU::S_ANDN2_B64_term:
3096 case AMDGPU::S_AND_B64_term:
3097 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3098 case AMDGPU::S_MOV_B32_term:
3099 case AMDGPU::S_XOR_B32_term:
3100 case AMDGPU::S_OR_B32_term:
3101 case AMDGPU::S_ANDN2_B32_term:
3102 case AMDGPU::S_AND_B32_term:
3103 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3104 break;
3105 case AMDGPU::SI_IF:
3106 case AMDGPU::SI_ELSE:
3107 case AMDGPU::SI_KILL_I1_TERMINATOR:
3108 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3109 // FIXME: It's messy that these need to be considered here at all.
3110 return true;
3111 default:
3112 llvm_unreachable("unexpected non-branch terminator inst");
3113 }
3114
3115 ++I;
3116 }
3117
3118 if (I == E)
3119 return false;
3120
3121 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3122}
3123
3125 int *BytesRemoved) const {
3126 unsigned Count = 0;
3127 unsigned RemovedSize = 0;
3129 // Skip over artificial terminators when removing instructions.
3130 if (MI.isBranch() || MI.isReturn()) {
3131 RemovedSize += getInstSizeInBytes(MI);
3132 MI.eraseFromParent();
3133 ++Count;
3134 }
3135 }
3136
3137 if (BytesRemoved)
3138 *BytesRemoved = RemovedSize;
3139
3140 return Count;
3141}
3142
3143// Copy the flags onto the implicit condition register operand.
3145 const MachineOperand &OrigCond) {
3146 CondReg.setIsUndef(OrigCond.isUndef());
3147 CondReg.setIsKill(OrigCond.isKill());
3148}
3149
3152 MachineBasicBlock *FBB,
3154 const DebugLoc &DL,
3155 int *BytesAdded) const {
3156 if (!FBB && Cond.empty()) {
3157 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3158 .addMBB(TBB);
3159 if (BytesAdded)
3160 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3161 return 1;
3162 }
3163
3164 if (Cond.size() == 1 && Cond[0].isReg()) {
3165 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
3166 .add(Cond[0])
3167 .addMBB(TBB);
3168 return 1;
3169 }
3170
3171 assert(TBB && Cond[0].isImm());
3172
3173 unsigned Opcode
3174 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3175
3176 if (!FBB) {
3177 MachineInstr *CondBr =
3178 BuildMI(&MBB, DL, get(Opcode))
3179 .addMBB(TBB);
3180
3181 // Copy the flags onto the implicit condition register operand.
3182 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3183 fixImplicitOperands(*CondBr);
3184
3185 if (BytesAdded)
3186 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3187 return 1;
3188 }
3189
3190 assert(TBB && FBB);
3191
3192 MachineInstr *CondBr =
3193 BuildMI(&MBB, DL, get(Opcode))
3194 .addMBB(TBB);
3195 fixImplicitOperands(*CondBr);
3196 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3197 .addMBB(FBB);
3198
3199 MachineOperand &CondReg = CondBr->getOperand(1);
3200 CondReg.setIsUndef(Cond[1].isUndef());
3201 CondReg.setIsKill(Cond[1].isKill());
3202
3203 if (BytesAdded)
3204 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3205
3206 return 2;
3207}
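// Sketch of the two-target case (TBB and FBB both set) with an SCC_TRUE
// predicate:
//   s_cbranch_scc1 TBB
//   s_branch       FBB
// hence the return value of 2, with BytesAdded = 8 (each branch accounted at
// 4 bytes), or 16 when ST.hasOffset3fBug() is set.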
3208
3211 if (Cond.size() != 2) {
3212 return true;
3213 }
3214
3215 if (Cond[0].isImm()) {
3216 Cond[0].setImm(-Cond[0].getImm());
3217 return false;
3218 }
3219
3220 return true;
3221}
3222
3225 Register DstReg, Register TrueReg,
3226 Register FalseReg, int &CondCycles,
3227 int &TrueCycles, int &FalseCycles) const {
3228 switch (Cond[0].getImm()) {
3229 case VCCNZ:
3230 case VCCZ: {
3232 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3233 if (MRI.getRegClass(FalseReg) != RC)
3234 return false;
3235
3236 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3237 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3238
3239 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3240 return RI.hasVGPRs(RC) && NumInsts <= 6;
3241 }
3242 case SCC_TRUE:
3243 case SCC_FALSE: {
3244 // FIXME: We could insert for VGPRs if we could replace the original compare
3245 // with a vector one.
3247 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3248 if (MRI.getRegClass(FalseReg) != RC)
3249 return false;
3250
3251 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3252
3253 // Multiples of 8 bytes (64 bits) can use s_cselect_b64.
3254 if (NumInsts % 2 == 0)
3255 NumInsts /= 2;
3256
3257 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3258 return RI.isSGPRClass(RC);
3259 }
3260 default:
3261 return false;
3262 }
3263}
3264
3268 Register TrueReg, Register FalseReg) const {
3269 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3270 if (Pred == VCCZ || Pred == SCC_FALSE) {
3271 Pred = static_cast<BranchPredicate>(-Pred);
3272 std::swap(TrueReg, FalseReg);
3273 }
3274
3276 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3277 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3278
3279 if (DstSize == 32) {
3281 if (Pred == SCC_TRUE) {
3282 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3283 .addReg(TrueReg)
3284 .addReg(FalseReg);
3285 } else {
3286 // Instruction's operands are backwards from what is expected.
3287 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3288 .addReg(FalseReg)
3289 .addReg(TrueReg);
3290 }
3291
3292 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3293 return;
3294 }
3295
3296 if (DstSize == 64 && Pred == SCC_TRUE) {
3298 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3299 .addReg(TrueReg)
3300 .addReg(FalseReg);
3301
3302 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3303 return;
3304 }
3305
3306 static const int16_t Sub0_15[] = {
3307 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3308 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3309 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3310 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3311 };
3312
3313 static const int16_t Sub0_15_64[] = {
3314 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3315 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3316 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3317 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3318 };
3319
3320 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3321 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3322 const int16_t *SubIndices = Sub0_15;
3323 int NElts = DstSize / 32;
3324
3325 // 64-bit select is only available for SALU.
3326 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3327 if (Pred == SCC_TRUE) {
3328 if (NElts % 2) {
3329 SelOp = AMDGPU::S_CSELECT_B32;
3330 EltRC = &AMDGPU::SGPR_32RegClass;
3331 } else {
3332 SelOp = AMDGPU::S_CSELECT_B64;
3333 EltRC = &AMDGPU::SGPR_64RegClass;
3334 SubIndices = Sub0_15_64;
3335 NElts /= 2;
3336 }
3337 }
3338
3340 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3341
3342 I = MIB->getIterator();
3343
3345 for (int Idx = 0; Idx != NElts; ++Idx) {
3346 Register DstElt = MRI.createVirtualRegister(EltRC);
3347 Regs.push_back(DstElt);
3348
3349 unsigned SubIdx = SubIndices[Idx];
3350
3352 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3353 Select =
3354 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3355 .addReg(FalseReg, 0, SubIdx)
3356 .addReg(TrueReg, 0, SubIdx);
3357 } else {
3358 Select =
3359 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3360 .addReg(TrueReg, 0, SubIdx)
3361 .addReg(FalseReg, 0, SubIdx);
3362 }
3363
3364 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3366
3367 MIB.addReg(DstElt)
3368 .addImm(SubIdx);
3369 }
3370}
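// Rough expansion example (value names illustrative): a VCCNZ select of two
// 128-bit VGPR tuples becomes four per-32-bit selects combined by a
// REG_SEQUENCE (note the operand order, false value first):
//   %e0 = V_CNDMASK_B32_e32 %false.sub0, %true.sub0, implicit $vcc
//   ... (sub1..sub3 likewise)
//   %dst = REG_SEQUENCE %e0, %subreg.sub0, ..., %e3, %subreg.sub3
// whereas an SCC_TRUE select of a 128-bit SGPR tuple uses two S_CSELECT_B64
// over the 64-bit subregisters.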
3371
3373 switch (MI.getOpcode()) {
3374 case AMDGPU::V_MOV_B32_e32:
3375 case AMDGPU::V_MOV_B32_e64:
3376 case AMDGPU::V_MOV_B64_PSEUDO:
3377 case AMDGPU::V_MOV_B64_e32:
3378 case AMDGPU::V_MOV_B64_e64:
3379 case AMDGPU::S_MOV_B32:
3380 case AMDGPU::S_MOV_B64:
3381 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3382 case AMDGPU::COPY:
3383 case AMDGPU::WWM_COPY:
3384 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3385 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3386 case AMDGPU::V_ACCVGPR_MOV_B32:
3387 return true;
3388 default:
3389 return false;
3390 }
3391}
3392
3393static constexpr unsigned ModifierOpNames[] = {
3394 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3395 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3396 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3397
3399 unsigned Opc = MI.getOpcode();
3400 for (unsigned Name : reverse(ModifierOpNames)) {
3402 if (Idx >= 0)
3403 MI.removeOperand(Idx);
3404 }
3405}
3406
3408 Register Reg, MachineRegisterInfo *MRI) const {
3409 if (!MRI->hasOneNonDBGUse(Reg))
3410 return false;
3411
3412 switch (DefMI.getOpcode()) {
3413 default:
3414 return false;
3415 case AMDGPU::V_MOV_B64_e32:
3416 case AMDGPU::S_MOV_B64:
3417 case AMDGPU::V_MOV_B64_PSEUDO:
3418 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3419 case AMDGPU::V_MOV_B32_e32:
3420 case AMDGPU::S_MOV_B32:
3421 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3422 break;
3423 }
3424
3425 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
3426 assert(ImmOp);
3427 // FIXME: We could handle FrameIndex values here.
3428 if (!ImmOp->isImm())
3429 return false;
3430
3431 auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
3432 int64_t Imm = ImmOp->getImm();
3433 switch (UseOp.getSubReg()) {
3434 default:
3435 return Imm;
3436 case AMDGPU::sub0:
3437 return Lo_32(Imm);
3438 case AMDGPU::sub1:
3439 return Hi_32(Imm);
3440 case AMDGPU::lo16:
3441 return APInt(16, Imm).getSExtValue();
3442 case AMDGPU::hi16:
3443 return APInt(32, Imm).ashr(16).getSExtValue();
3444 case AMDGPU::sub1_lo16:
3445 return APInt(16, Hi_32(Imm)).getSExtValue();
3446 case AMDGPU::sub1_hi16:
3447 return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue();
3448 }
3449 };
3450
3451 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3452
3453 unsigned Opc = UseMI.getOpcode();
3454 if (Opc == AMDGPU::COPY) {
3455 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3456
3457 Register DstReg = UseMI.getOperand(0).getReg();
3458 unsigned OpSize = getOpSize(UseMI, 0);
3459 bool Is16Bit = OpSize == 2;
3460 bool Is64Bit = OpSize == 8;
3461 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3462 unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3463 : AMDGPU::V_MOV_B32_e32
3464 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3465 : AMDGPU::S_MOV_B32;
3466 APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)));
3467
3468 if (RI.isAGPR(*MRI, DstReg)) {
3469 if (Is64Bit || !isInlineConstant(Imm))
3470 return false;
3471 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3472 }
3473
3474 if (Is16Bit) {
3475 if (isVGPRCopy)
3476 return false; // Do not clobber vgpr_hi16
3477
3478 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3479 return false;
3480
3481 UseMI.getOperand(0).setSubReg(0);
3482 if (DstReg.isPhysical()) {
3483 DstReg = RI.get32BitRegister(DstReg);
3484 UseMI.getOperand(0).setReg(DstReg);
3485 }
3486 assert(UseMI.getOperand(1).getReg().isVirtual());
3487 }
3488
3489 const MCInstrDesc &NewMCID = get(NewOpc);
3490 if (DstReg.isPhysical() &&
3491 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3492 return false;
3493
3494 UseMI.setDesc(NewMCID);
3495 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3496 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3497 return true;
3498 }
3499
3500 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3501 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3502 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3503 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3504 Opc == AMDGPU::V_FMAC_F16_t16_e64) {
3505 // Don't fold if we are using source or output modifiers. The new VOP2
3506 // instructions don't have them.
3508 return false;
3509
3510 // If this is a free constant, there's no reason to do this.
3511 // TODO: We could fold this here instead of letting SIFoldOperands do it
3512 // later.
3513 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
3514
3515 // Any src operand can be used for the legality check.
3516 if (isInlineConstant(UseMI, *Src0, *ImmOp))
3517 return false;
3518
3519 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3520 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
3521 bool IsFMA =
3522 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3523 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3524 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3525 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3526 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3527
3528 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3529 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3530 (Src1->isReg() && Src1->getReg() == Reg)) {
3531 MachineOperand *RegSrc =
3532 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3533 if (!RegSrc->isReg())
3534 return false;
3535 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3536 ST.getConstantBusLimit(Opc) < 2)
3537 return false;
3538
3539 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3540 return false;
3541
3542 // If src2 is also a literal constant then we have to choose which one to
3543 // fold. In general it is better to choose madak so that the other literal
3544 // can be materialized in an sgpr instead of a vgpr:
3545 // s_mov_b32 s0, literal
3546 // v_madak_f32 v0, s0, v0, literal
3547 // Instead of:
3548 // v_mov_b32 v1, literal
3549 // v_madmk_f32 v0, v0, literal, v1
3550 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3551 if (Def && Def->isMoveImmediate() &&
3552 !isInlineConstant(Def->getOperand(1)))
3553 return false;
3554
3555 unsigned NewOpc =
3556 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3557 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3558 : AMDGPU::V_FMAMK_F16)
3559 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3560 if (pseudoToMCOpcode(NewOpc) == -1)
3561 return false;
3562
3563 // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3564 // would also require restricting their register classes. For now
3565 // just bail out.
3566 if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
3567 return false;
3568
3569 const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
3570
3571 // FIXME: This would be a lot easier if we could return a new instruction
3572 // instead of having to modify in place.
3573
3574 Register SrcReg = RegSrc->getReg();
3575 unsigned SrcSubReg = RegSrc->getSubReg();
3576 Src0->setReg(SrcReg);
3577 Src0->setSubReg(SrcSubReg);
3578 Src0->setIsKill(RegSrc->isKill());
3579
3580 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3581 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3582 Opc == AMDGPU::V_FMAC_F16_e64)
3583 UseMI.untieRegOperand(
3584 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3585
3586 Src1->ChangeToImmediate(Imm);
3587
3588 removeModOperands(UseMI);
3589 UseMI.setDesc(get(NewOpc));
3590
3591 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3592 if (DeleteDef)
3593 DefMI.eraseFromParent();
3594
3595 return true;
3596 }
3597
3598 // Added part is the constant: Use v_madak_{f16, f32}.
3599 if (Src2->isReg() && Src2->getReg() == Reg) {
3600 if (ST.getConstantBusLimit(Opc) < 2) {
3601 // Not allowed to use constant bus for another operand.
3602 // We can however allow an inline immediate as src0.
3603 bool Src0Inlined = false;
3604 if (Src0->isReg()) {
3605 // Try to inline the constant if possible.
3606 // If the def is a move-immediate and this is its single use,
3607 // we are saving a VGPR here.
3608 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3609 if (Def && Def->isMoveImmediate() &&
3610 isInlineConstant(Def->getOperand(1)) &&
3611 MRI->hasOneUse(Src0->getReg())) {
3612 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3613 Src0Inlined = true;
3614 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3615 RI.isSGPRReg(*MRI, Src0->getReg())) {
3616 return false;
3617 }
3618 // VGPR is okay as Src0 - fallthrough
3619 }
3620
3621 if (Src1->isReg() && !Src0Inlined) {
3622 // We have one slot left for an inlinable constant so far - try to fill it
3623 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3624 if (Def && Def->isMoveImmediate() &&
3625 isInlineConstant(Def->getOperand(1)) &&
3626 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3627 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3628 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3629 return false;
3630 // VGPR is okay as Src1 - fallthrough
3631 }
3632 }
3633
3634 unsigned NewOpc =
3635 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3636 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3637 : AMDGPU::V_FMAAK_F16)
3638 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3639 if (pseudoToMCOpcode(NewOpc) == -1)
3640 return false;
3641
3642 // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3643 // would also require restricting their register classes. For now
3644 // just bail out.
3645 if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
3646 return false;
3647
3648 // FIXME: This would be a lot easier if we could return a new instruction
3649 // instead of having to modify in place.
3650
3651 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3652 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3653 Opc == AMDGPU::V_FMAC_F16_e64)
3654 UseMI.untieRegOperand(
3655 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3656
3657 // ChangingToImmediate adds Src2 back to the instruction.
3658 Src2->ChangeToImmediate(getImmFor(*Src2));
3659
3660 // These come before src2.
3661 removeModOperands(UseMI);
3662 UseMI.setDesc(get(NewOpc));
3663 // It might happen that UseMI was commuted and we now have an SGPR as src1.
3664 // If so, the combination of an inline constant and an SGPR is illegal and
3665 // the operands must be legalized.
3666 legalizeOperands(UseMI);
3667
3668 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3669 if (DeleteDef)
3670 DefMI.eraseFromParent();
3671
3672 return true;
3673 }
3674 }
3675
3676 return false;
3677}
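// A sketch of the overall effect of the immediate folding above, with made-up
// register numbers and a made-up literal: when the folded operand feeds the
// multiply, something like
//   v_mov_b32 v1, 0x40490fdb
//   v_mad_f32 v0, v2, v1, v3
// becomes
//   v_madmk_f32 v0, v2, 0x40490fdb, v3
// and when the folded operand feeds the addend, the v_madak_f32 form is
// produced instead.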
3678
3679static bool
3680 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3681 ArrayRef<const MachineOperand *> BaseOps2) {
3682 if (BaseOps1.size() != BaseOps2.size())
3683 return false;
3684 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3685 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3686 return false;
3687 }
3688 return true;
3689}
3690
3691static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3692 LocationSize WidthB, int OffsetB) {
3693 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3694 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3695 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3696 return LowWidth.hasValue() &&
3697 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3698}
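// For example, with byte offsets: an 8-byte access at offset 0 and a 4-byte
// access at offset 8 do not overlap (0 + 8 <= 8), whereas a 4-byte access at
// offset 4 would overlap the same 8-byte access (0 + 8 > 4).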
3699
3700bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3701 const MachineInstr &MIb) const {
3702 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3703 int64_t Offset0, Offset1;
3704 LocationSize Dummy0 = 0, Dummy1 = 0;
3705 bool Offset0IsScalable, Offset1IsScalable;
3706 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3707 Dummy0, &RI) ||
3708 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3709 Dummy1, &RI))
3710 return false;
3711
3712 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3713 return false;
3714
3715 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3716 // FIXME: Handle ds_read2 / ds_write2.
3717 return false;
3718 }
3719 LocationSize Width0 = MIa.memoperands().front()->getSize();
3720 LocationSize Width1 = MIb.memoperands().front()->getSize();
3721 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3722}
3723
3724 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3725 const MachineInstr &MIb) const {
3726 assert(MIa.mayLoadOrStore() &&
3727 "MIa must load from or modify a memory location");
3728 assert(MIb.mayLoadOrStore() &&
3729 "MIb must load from or modify a memory location");
3730
3731 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3732 return false;
3733
3734 // XXX - Can we relax this between address spaces?
3735 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3736 return false;
3737
3738 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3739 return false;
3740
3741 // TODO: Should we check the address space from the MachineMemOperand? That
3742 // would allow us to distinguish objects we know don't alias based on the
3743 // underlying address space, even if it was lowered to a different one,
3744 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3745 // buffer.
3746 if (isDS(MIa)) {
3747 if (isDS(MIb))
3748 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3749
3750 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3751 }
3752
3753 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3754 if (isMUBUF(MIb) || isMTBUF(MIb))
3755 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3756
3757 if (isFLAT(MIb))
3758 return isFLATScratch(MIb);
3759
3760 return !isSMRD(MIb);
3761 }
3762
3763 if (isSMRD(MIa)) {
3764 if (isSMRD(MIb))
3765 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3766
3767 if (isFLAT(MIb))
3768 return isFLATScratch(MIb);
3769
3770 return !isMUBUF(MIb) && !isMTBUF(MIb);
3771 }
3772
3773 if (isFLAT(MIa)) {
3774 if (isFLAT(MIb)) {
3775 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3776 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3777 return true;
3778
3779 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3780 }
3781
3782 return false;
3783 }
3784
3785 return false;
3786}
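// Rough intuition for the checks above (illustrative, not an exhaustive list):
// a DS (LDS) access and a segment-specific global access such as
//   ds_write_b32 v0, v1
//   global_load_dword v2, v[4:5], off
// can never alias because they address different segments, while two DS or two
// buffer accesses are only disjoint if their offset ranges provably do not
// overlap.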
3787
3788 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3789 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3790 if (Reg.isPhysical())
3791 return false;
3792 auto *Def = MRI.getUniqueVRegDef(Reg);
3793 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3794 Imm = Def->getOperand(1).getImm();
3795 if (DefMI)
3796 *DefMI = Def;
3797 return true;
3798 }
3799 return false;
3800}
3801
3802static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3803 MachineInstr **DefMI = nullptr) {
3804 if (!MO->isReg())
3805 return false;
3806 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3807 const MachineRegisterInfo &MRI = MF->getRegInfo();
3808 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3809}
3810
3811 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3812 MachineInstr &NewMI) {
3813 if (LV) {
3814 unsigned NumOps = MI.getNumOperands();
3815 for (unsigned I = 1; I < NumOps; ++I) {
3816 MachineOperand &Op = MI.getOperand(I);
3817 if (Op.isReg() && Op.isKill())
3818 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3819 }
3820 }
3821}
3822
3823 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3824 LiveVariables *LV,
3825 LiveIntervals *LIS) const {
3826 MachineBasicBlock &MBB = *MI.getParent();
3827 unsigned Opc = MI.getOpcode();
3828
3829 // Handle MFMA.
3830 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3831 if (NewMFMAOpc != -1) {
3832 MachineInstrBuilder MIB =
3833 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3834 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3835 MIB.add(MI.getOperand(I));
3836 updateLiveVariables(LV, MI, *MIB);
3837 if (LIS) {
3838 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3839 // SlotIndex of defs needs to be updated when converting to early-clobber
3840 MachineOperand &Def = MIB->getOperand(0);
3841 if (Def.isEarlyClobber() && Def.isReg() &&
3842 LIS->hasInterval(Def.getReg())) {
3843 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
3844 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
3845 auto &LI = LIS->getInterval(Def.getReg());
3846 auto UpdateDefIndex = [&](LiveRange &LR) {
3847 auto S = LR.find(OldIndex);
3848 if (S != LR.end() && S->start == OldIndex) {
3849 assert(S->valno && S->valno->def == OldIndex);
3850 S->start = NewIndex;
3851 S->valno->def = NewIndex;
3852 }
3853 };
3854 UpdateDefIndex(LI);
3855 for (auto &SR : LI.subranges())
3856 UpdateDefIndex(SR);
3857 }
3858 }
3859 return MIB;
3860 }
3861
3862 if (SIInstrInfo::isWMMA(MI)) {
3863 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
3864 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3865 .setMIFlags(MI.getFlags());
3866 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3867 MIB->addOperand(MI.getOperand(I));
3868
3869 updateLiveVariables(LV, MI, *MIB);
3870 if (LIS)
3871 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3872
3873 return MIB;
3874 }
3875
3876 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
3877 "V_FMAC_F16_t16_e32 is not supported and not expected to be present "
3878 "pre-RA");
3879
3880 // Handle MAC/FMAC.
3881 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3882 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3883 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3884 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3885 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3886 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3887 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3888 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3889 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3890 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3891 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3892 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3893 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3894 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3895 bool Src0Literal = false;
3896
3897 switch (Opc) {
3898 default:
3899 return nullptr;
3900 case AMDGPU::V_MAC_F16_e64:
3901 case AMDGPU::V_FMAC_F16_e64:
3902 case AMDGPU::V_FMAC_F16_t16_e64:
3903 case AMDGPU::V_MAC_F32_e64:
3904 case AMDGPU::V_MAC_LEGACY_F32_e64:
3905 case AMDGPU::V_FMAC_F32_e64:
3906 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3907 case AMDGPU::V_FMAC_F64_e64:
3908 break;
3909 case AMDGPU::V_MAC_F16_e32:
3910 case AMDGPU::V_FMAC_F16_e32:
3911 case AMDGPU::V_MAC_F32_e32:
3912 case AMDGPU::V_MAC_LEGACY_F32_e32:
3913 case AMDGPU::V_FMAC_F32_e32:
3914 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3915 case AMDGPU::V_FMAC_F64_e32: {
3916 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3917 AMDGPU::OpName::src0);
3918 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3919 if (!Src0->isReg() && !Src0->isImm())
3920 return nullptr;
3921
3922 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3923 Src0Literal = true;
3924
3925 break;
3926 }
3927 }
3928
3929 MachineInstrBuilder MIB;
3930 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3931 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3932 const MachineOperand *Src0Mods =
3933 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3934 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3935 const MachineOperand *Src1Mods =
3936 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3937 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3938 const MachineOperand *Src2Mods =
3939 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
3940 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3941 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3942 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
3943
3944 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
3945 !IsLegacy &&
3946 // If we have an SGPR input, we will violate the constant bus restriction.
3947 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3948 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3949 MachineInstr *DefMI;
3950 const auto killDef = [&]() -> void {
3951 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3952 // The only user is the instruction which will be killed.
3953 Register DefReg = DefMI->getOperand(0).getReg();
3954 if (!MRI.hasOneNonDBGUse(DefReg))
3955 return;
3956 // We cannot just remove the DefMI here; the calling pass would crash.
3957 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
3958 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
3959 DefMI->removeOperand(I);
3960 if (LV)
3961 LV->getVarInfo(DefReg).AliveBlocks.clear();
3962 };
3963
3964 int64_t Imm;
3965 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
3966 unsigned NewOpc =
3967 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3968 : AMDGPU::V_FMAAK_F16)
3969 : AMDGPU::V_FMAAK_F32)
3970 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
3971 if (pseudoToMCOpcode(NewOpc) != -1) {
3972 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3973 .add(*Dst)
3974 .add(*Src0)
3975 .add(*Src1)
3976 .addImm(Imm);
3977 updateLiveVariables(LV, MI, *MIB);
3978 if (LIS)
3979 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3980 killDef();
3981 return MIB;
3982 }
3983 }
3984 unsigned NewOpc =
3985 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3986 : AMDGPU::V_FMAMK_F16)
3987 : AMDGPU::V_FMAMK_F32)
3988 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
3989 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
3990 if (pseudoToMCOpcode(NewOpc) != -1) {
3991 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3992 .add(*Dst)
3993 .add(*Src0)
3994 .addImm(Imm)
3995 .add(*Src2);
3996 updateLiveVariables(LV, MI, *MIB);
3997 if (LIS)
3998 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3999 killDef();
4000 return MIB;
4001 }
4002 }
4003 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4004 if (Src0Literal) {
4005 Imm = Src0->getImm();
4006 DefMI = nullptr;
4007 }
4008 if (pseudoToMCOpcode(NewOpc) != -1 &&
4009 isOperandLegal(
4010 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4011 Src1)) {
4012 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4013 .add(*Dst)
4014 .add(*Src1)
4015 .addImm(Imm)
4016 .add(*Src2);
4017 updateLiveVariables(LV, MI, *MIB);
4018 if (LIS)
4019 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4020 if (DefMI)
4021 killDef();
4022 return MIB;
4023 }
4024 }
4025 }
4026
4027 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4028 // if VOP3 does not allow a literal operand.
4029 if (Src0Literal && !ST.hasVOP3Literal())
4030 return nullptr;
4031
4032 unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
4033 : IsF64 ? AMDGPU::V_FMA_F64_e64
4034 : IsLegacy
4035 ? AMDGPU::V_FMA_LEGACY_F32_e64
4036 : AMDGPU::V_FMA_F32_e64
4037 : IsF16 ? AMDGPU::V_MAD_F16_e64
4038 : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
4039 : AMDGPU::V_MAD_F32_e64;
4040 if (pseudoToMCOpcode(NewOpc) == -1)
4041 return nullptr;
4042
4043 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4044 .add(*Dst)
4045 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4046 .add(*Src0)
4047 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4048 .add(*Src1)
4049 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4050 .add(*Src2)
4051 .addImm(Clamp ? Clamp->getImm() : 0)
4052 .addImm(Omod ? Omod->getImm() : 0);
4053 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4054 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4055 updateLiveVariables(LV, MI, *MIB);
4056 if (LIS)
4057 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4058 return MIB;
4059}
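// Illustrative shape of the two-address to three-address conversion above,
// with arbitrary virtual registers:
//   %d = V_MAC_F32_e32 %a, %b, %d (tied)
// is rewritten to the VOP3 form
//   %d = V_MAD_F32_e64 0, %a, 0, %b, 0, %d, 0, 0
// where the extra zero operands are the source modifiers, clamp and omod, or
// to the V_MADAK/V_MADMK forms when one source is a foldable immediate.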
4060
4061// It's not generally safe to move VALU instructions across these since it will
4062// start using the register as a base index rather than directly.
4063// XXX - Why isn't hasSideEffects sufficient for these?
4064 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4065 switch (MI.getOpcode()) {
4066 case AMDGPU::S_SET_GPR_IDX_ON:
4067 case AMDGPU::S_SET_GPR_IDX_MODE:
4068 case AMDGPU::S_SET_GPR_IDX_OFF:
4069 return true;
4070 default:
4071 return false;
4072 }
4073}
4074
4075 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4076 const MachineBasicBlock *MBB,
4077 const MachineFunction &MF) const {
4078 // Skipping the check for SP writes in the base implementation. It was
4079 // apparently added due to compile-time concerns.
4080 //
4081 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4082 // but is probably avoidable.
4083
4084 // Copied from base implementation.
4085 // Terminators and labels can't be scheduled around.
4086 if (MI.isTerminator() || MI.isPosition())
4087 return true;
4088
4089 // INLINEASM_BR can jump to another block
4090 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4091 return true;
4092
4093 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4094 return true;
4095
4096 // Target-independent instructions do not have an implicit-use of EXEC, even
4097 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4098 // boundaries prevents incorrect movements of such instructions.
4099 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4100 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4101 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4102 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4103 changesVGPRIndexingMode(MI);
4104}
4105
4106 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4107 return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
4108}
4109
4110 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4111 // Skip the full operand and register alias search that modifiesRegister
4112 // does. There are only a handful of instructions that touch this; it is only
4113 // an implicit def and doesn't alias any other registers.
4114 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4115}
4116
4117 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4118 unsigned Opcode = MI.getOpcode();
4119
4120 if (MI.mayStore() && isSMRD(MI))
4121 return true; // scalar store or atomic
4122
4123 // This will terminate the function when other lanes may need to continue.
4124 if (MI.isReturn())
4125 return true;
4126
4127 // These instructions cause shader I/O that may cause hardware lockups
4128 // when executed with an empty EXEC mask.
4129 //
4130 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4131 // EXEC = 0, but checking for that case here seems not worth it
4132 // given the typical code patterns.
4133 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4134 isEXP(Opcode) ||
4135 Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
4136 Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
4137 return true;
4138
4139 if (MI.isCall() || MI.isInlineAsm())
4140 return true; // conservative assumption
4141
4142 // A mode change is a scalar operation that influences vector instructions.
4143 if (modifiesModeRegister(MI))
4144 return true;
4145
4146 // These are like SALU instructions in terms of effects, so it's questionable
4147 // whether we should return true for those.
4148 //
4149 // However, executing them with EXEC = 0 causes them to operate on undefined
4150 // data, which we avoid by returning true here.
4151 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4152 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4153 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4154 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4155 return true;
4156
4157 return false;
4158}
4159
4160 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4161 const MachineInstr &MI) const {
4162 if (MI.isMetaInstruction())
4163 return false;
4164
4165 // This won't read exec if this is an SGPR->SGPR copy.
4166 if (MI.isCopyLike()) {
4167 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4168 return true;
4169
4170 // Make sure this isn't copying exec as a normal operand
4171 return MI.readsRegister(AMDGPU::EXEC, &RI);
4172 }
4173
4174 // Make a conservative assumption about the callee.
4175 if (MI.isCall())
4176 return true;
4177
4178 // Be conservative with any unhandled generic opcodes.
4179 if (!isTargetSpecificOpcode(MI.getOpcode()))
4180 return true;
4181
4182 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4183}
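// For example, an SGPR-to-SGPR COPY never reads EXEC, while a COPY with a VGPR
// destination is eventually lowered to a v_mov and therefore does; that is why
// copy-like instructions are special-cased above.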
4184
4185bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4186 switch (Imm.getBitWidth()) {
4187 case 1: // This likely will be a condition code mask.
4188 return true;
4189
4190 case 32:
4191 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4192 ST.hasInv2PiInlineImm());
4193 case 64:
4194 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4195 ST.hasInv2PiInlineImm());
4196 case 16:
4197 return ST.has16BitInsts() &&
4198 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4199 ST.hasInv2PiInlineImm());
4200 default:
4201 llvm_unreachable("invalid bitwidth");
4202 }
4203}
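// For reference, the 32-bit inline constants are the integers -16..64 plus a
// small set of floating-point values (+/-0.5, +/-1.0, +/-2.0, +/-4.0 and, when
// the subtarget supports it, 1/(2*pi)); anything else has to be encoded as a
// separate literal dword.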
4204
4205 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4206 APInt IntImm = Imm.bitcastToAPInt();
4207 int64_t IntImmVal = IntImm.getSExtValue();
4208 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4209 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4210 default:
4211 llvm_unreachable("invalid fltSemantics");
4212 case APFloat::S_IEEEsingle:
4213 case APFloat::S_IEEEdouble:
4214 return isInlineConstant(IntImm);
4215 case APFloat::S_BFloat:
4216 return ST.has16BitInsts() &&
4217 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4218 case APFloat::S_IEEEhalf:
4219 return ST.has16BitInsts() &&
4220 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4221 }
4222}
4223
4224 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
4225 uint8_t OperandType) const {
4226 assert(!MO.isReg() && "isInlineConstant called on register operand!");
4227 if (!MO.isImm())
4228 return false;
4229
4230 // MachineOperand provides no way to tell the true operand size, since it only
4231 // records a 64-bit value. We need to know the size to determine if a 32-bit
4232 // floating point immediate bit pattern is legal for an integer immediate. It
4233 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4234
4235 int64_t Imm = MO.getImm();
4236 switch (OperandType) {
4249 int32_t Trunc = static_cast<int32_t>(Imm);
4250 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4251 }
4258 ST.hasInv2PiInlineImm());
4262 // We would expect inline immediates to not be concerned with an integer/fp
4263 // distinction. However, in the case of 16-bit integer operations, the
4264 // "floating point" values appear to not work. They seem to read the low
4265 // 16 bits of 32-bit immediates, which happens to always work for the
4266 // integer values.
4267 //
4268 // See llvm bugzilla 46302.
4269 //
4270 // TODO: Theoretically we could use op-sel to use the high bits of the
4271 // 32-bit FP values.
4289 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4290 // A few special case instructions have 16-bit operands on subtargets
4291 // where 16-bit instructions are not legal.
4292 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4293 // constants in these cases
4294 int16_t Trunc = static_cast<int16_t>(Imm);
4295 return ST.has16BitInsts() &&
4296 AMDGPU::isInlinableLiteralI16(Trunc, ST.hasInv2PiInlineImm());
4297 }
4298
4299 return false;
4300 }
4305 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4306 int16_t Trunc = static_cast<int16_t>(Imm);
4307 return ST.has16BitInsts() &&
4308 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4309 }
4310 return false;
4311 }
4314 return false;
4317 // Always embedded in the instruction for free.
4318 return true;
4328 // Just ignore anything else.
4329 return true;
4330 default:
4331 llvm_unreachable("invalid operand type");
4332 }
4333}
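// As a concrete illustration for 32-bit operands: the bit pattern of 1.0f
// (0x3f800000) and small integers such as 64 are inline constants, whereas 65
// or 0.1f (0x3dcccccd) must be emitted as literals.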
4334
4335static bool compareMachineOp(const MachineOperand &Op0,
4336 const MachineOperand &Op1) {
4337 if (Op0.getType() != Op1.getType())
4338 return false;
4339
4340 switch (Op0.getType()) {
4342 return Op0.getReg() == Op1.getReg();
4344 return Op0.getImm() == Op1.getImm();
4345 default:
4346 llvm_unreachable("Didn't expect to be comparing these operand types");
4347 }
4348}
4349
4350 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4351 const MachineOperand &MO) const {
4352 const MCInstrDesc &InstDesc = MI.getDesc();
4353 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4354
4355 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4356
4357 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4358 return true;
4359
4360 if (OpInfo.RegClass < 0)
4361 return false;
4362
4363 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4364 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4365 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4366 AMDGPU::OpName::src2))
4367 return false;
4368 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4369 }
4370
4371 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4372 return false;
4373
4374 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4375 return true;
4376
4377 return ST.hasVOP3Literal();
4378}
4379
4380bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4381 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4382 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4383 return false;
4384
4385 int Op32 = AMDGPU::getVOPe32(Opcode);
4386 if (Op32 == -1)
4387 return false;
4388
4389 return pseudoToMCOpcode(Op32) != -1;
4390}
4391
4392bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4393 // The src0_modifiers operand is present on all instructions
4394 // that have modifiers.
4395
4396 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4397}
4398
4399 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4400 unsigned OpName) const {
4401 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4402 return Mods && Mods->getImm();
4403}
4404
4405 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4406 return any_of(ModifierOpNames,
4407 [&](unsigned Name) { return hasModifiersSet(MI, Name); });
4408}
4409
4410 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4411 const MachineRegisterInfo &MRI) const {
4412 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4413 // Can't shrink instruction with three operands.
4414 if (Src2) {
4415 switch (MI.getOpcode()) {
4416 default: return false;
4417
4418 case AMDGPU::V_ADDC_U32_e64:
4419 case AMDGPU::V_SUBB_U32_e64:
4420 case AMDGPU::V_SUBBREV_U32_e64: {
4421 const MachineOperand *Src1
4422 = getNamedOperand(MI, AMDGPU::OpName::src1);
4423 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4424 return false;
4425 // Additional verification is needed for sdst/src2.
4426 return true;
4427 }
4428 case AMDGPU::V_MAC_F16_e64:
4429 case AMDGPU::V_MAC_F32_e64:
4430 case AMDGPU::V_MAC_LEGACY_F32_e64:
4431 case AMDGPU::V_FMAC_F16_e64:
4432 case AMDGPU::V_FMAC_F16_t16_e64:
4433 case AMDGPU::V_FMAC_F32_e64:
4434 case AMDGPU::V_FMAC_F64_e64:
4435 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4436 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4437 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4438 return false;
4439 break;
4440
4441 case AMDGPU::V_CNDMASK_B32_e64:
4442 break;
4443 }
4444 }
4445
4446 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4447 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4448 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4449 return false;
4450
4451 // We don't need to check src0; all input types are legal, so just make sure
4452 // src0 isn't using any modifiers.
4453 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4454 return false;
4455
4456 // Can it be shrunk to a valid 32 bit opcode?
4457 if (!hasVALU32BitEncoding(MI.getOpcode()))
4458 return false;
4459
4460 // Check output modifiers
4461 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4462 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4463 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel);
4464}
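// For instance, a V_ADD_F32_e64 whose sources are plain VGPRs and which has no
// clamp or omod set can be shrunk to V_ADD_F32_e32, while the same instruction
// with an SGPR in src1 or with output modifiers must stay in the VOP3 encoding.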
4465
4466// Set VCC operand with all flags from \p Orig, except for setting it as
4467// implicit.
4468 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4469 const MachineOperand &Orig) {
4470
4471 for (MachineOperand &Use : MI.implicit_operands()) {
4472 if (Use.isUse() &&
4473 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4474 Use.setIsUndef(Orig.isUndef());
4475 Use.setIsKill(Orig.isKill());
4476 return;
4477 }
4478 }
4479}
4480
4481 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4482 unsigned Op32) const {
4483 MachineBasicBlock *MBB = MI.getParent();
4484 MachineInstrBuilder Inst32 =
4485 BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
4486 .setMIFlags(MI.getFlags());
4487
4488 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4489 // For VOPC instructions, this is replaced by an implicit def of vcc.
4490 if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::vdst)) {
4491 // dst
4492 Inst32.add(MI.getOperand(0));
4493 } else if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::sdst)) {
4494 // VOPCX instructions won't be writing to an explicit dst, so this should
4495 // not fail for these instructions.
4496 assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
4497 (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
4498 "Unexpected case");
4499 }
4500
4501 Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
4502
4503 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4504 if (Src1)
4505 Inst32.add(*Src1);
4506
4507 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4508
4509 if (Src2) {
4510 int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
4511 if (Op32Src2Idx != -1) {
4512 Inst32.add(*Src2);
4513 } else {
4514 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4515 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4516 // of vcc was already added during the initial BuildMI, but we
4517 // 1) may need to change vcc to vcc_lo to preserve the original register
4518 // 2) have to preserve the original flags.
4519 fixImplicitOperands(*Inst32);
4520 copyFlagsToImplicitVCC(*Inst32, *Src2);
4521 }
4522 }
4523
4524 return Inst32;
4525}
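// Sketch of the V_CNDMASK special case handled above, with arbitrary registers:
//   $vgpr0 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $vcc
// shrinks to
//   $vgpr0 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc
// so the explicit src2 disappears and its flags are copied onto the implicit
// vcc use.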
4526
4527 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4528 const MachineOperand &MO,
4529 const MCOperandInfo &OpInfo) const {
4530 // Literal constants use the constant bus.
4531 if (!MO.isReg())
4532 return !isInlineConstant(MO, OpInfo);
4533
4534 if (!MO.isUse())
4535 return false;
4536
4537 if (MO.getReg().isVirtual())
4538 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4539
4540 // Null is free
4541 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4542 return false;
4543
4544 // SGPRs use the constant bus
4545 if (MO.isImplicit()) {
4546 return MO.getReg() == AMDGPU::M0 ||
4547 MO.getReg() == AMDGPU::VCC ||
4548 MO.getReg() == AMDGPU::VCC_LO;
4549 } else {
4550 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4551 AMDGPU::SReg_64RegClass.contains(MO.getReg());
4552 }
4553}
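// Example: in V_ADD_F32_e32 v0, s0, v1 the SGPR s0 takes the single constant
// bus read available to a VOP2 on pre-GFX10 subtargets, so a second SGPR or a
// literal in the same instruction would be illegal there; GFX10+ allows two
// constant bus reads per instruction.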
4554
4555 static Register findImplicitSGPRRead(const MachineInstr &MI) {
4556 for (const MachineOperand &MO : MI.implicit_operands()) {
4557 // We only care about reads.
4558 if (MO.isDef())
4559 continue;
4560
4561 switch (MO.getReg()) {
4562 case AMDGPU::VCC:
4563 case AMDGPU::VCC_LO:
4564 case AMDGPU::VCC_HI:
4565 case AMDGPU::M0:
4566 case AMDGPU::FLAT_SCR:
4567 return MO.getReg();
4568
4569 default:
4570 break;
4571 }
4572 }
4573
4574 return Register();
4575}
4576
4577static bool shouldReadExec(const MachineInstr &MI) {
4578 if (SIInstrInfo::isVALU(MI)) {
4579 switch (MI.getOpcode()) {
4580 case AMDGPU::V_READLANE_B32:
4581 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4582 case AMDGPU::V_WRITELANE_B32:
4583 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4584 return false;
4585 }
4586
4587 return true;
4588 }
4589
4590 if (MI.isPreISelOpcode() ||
4591 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4592 SIInstrInfo::isSALU(MI) ||
4593 SIInstrInfo::isSMRD(MI))
4594 return false;
4595
4596 return true;
4597}
4598
4599static bool isSubRegOf(const SIRegisterInfo &TRI,
4600 const MachineOperand &SuperVec,
4601 const MachineOperand &SubReg) {
4602 if (SubReg.getReg().isPhysical())
4603 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4604
4605 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4606 SubReg.getReg() == SuperVec.getReg();
4607}
4608
4609 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4610 StringRef &ErrInfo) const {
4611 uint16_t Opcode = MI.getOpcode();
4612 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
4613 return true;
4614
4615 const MachineFunction *MF = MI.getParent()->getParent();
4616 const MachineRegisterInfo &MRI = MF->getRegInfo();
4617
4618 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4619 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4620 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4621 int Src3Idx = -1;
4622 if (Src0Idx == -1) {
4623 // VOPD V_DUAL_* instructions use different operand names.
4624 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4625 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4626 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4627 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4628 }
4629
4630 // Make sure the number of operands is correct.
4631 const MCInstrDesc &Desc = get(Opcode);
4632 if (!Desc.isVariadic() &&
4633 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4634 ErrInfo = "Instruction has wrong number of operands.";
4635 return false;
4636 }
4637
4638 if (MI.isInlineAsm()) {
4639 // Verify register classes for inlineasm constraints.
4640 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4641 I != E; ++I) {
4642 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4643 if (!RC)
4644 continue;
4645
4646 const MachineOperand &Op = MI.getOperand(I);
4647 if (!Op.isReg())
4648 continue;
4649
4650 Register Reg = Op.getReg();
4651 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4652 ErrInfo = "inlineasm operand has incorrect register class.";
4653 return false;
4654 }
4655 }
4656
4657 return true;
4658 }
4659
4660 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4661 ErrInfo = "missing memory operand from image instruction.";
4662 return false;
4663 }
4664
4665 // Make sure the register classes are correct.
4666 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4667 const MachineOperand &MO = MI.getOperand(i);
4668 if (MO.isFPImm()) {
4669 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4670 "all fp values to integers.";
4671 return false;
4672 }
4673
4674 int RegClass = Desc.operands()[i].RegClass;
4675
4676 switch (Desc.operands()[i].OperandType) {
4677 case MCOI::OPERAND_REGISTER:
4678 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4679 ErrInfo = "Illegal immediate value for operand.";
4680 return false;
4681 }
4682 break;
4687 break;
4699 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4700 ErrInfo = "Illegal immediate value for operand.";
4701 return false;
4702 }
4703 break;
4704 }
4706 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4707 ErrInfo = "Expected inline constant for operand.";
4708 return false;
4709 }
4710 break;
4713 // Check if this operand is an immediate.
4714 // FrameIndex operands will be replaced by immediates, so they are
4715 // allowed.
4716 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4717 ErrInfo = "Expected immediate, but got non-immediate";
4718 return false;
4719 }
4720 [[fallthrough]];
4721 default:
4722 continue;
4723 }
4724
4725 if (!MO.isReg())
4726 continue;
4727 Register Reg = MO.getReg();
4728 if (!Reg)
4729 continue;
4730
4731 // FIXME: Ideally we would have separate instruction definitions with the
4732 // aligned register constraint.
4733 // FIXME: We do not verify inline asm operands, but custom inline asm
4734 // verification is broken anyway
4735 if (ST.needsAlignedVGPRs()) {
4736 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4737 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4738 const TargetRegisterClass *SubRC =
4739 RI.getSubRegisterClass(RC, MO.getSubReg());
4740 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4741 if (RC)
4742 RC = SubRC;
4743 }
4744
4745 // Check that this is the aligned version of the class.
4746 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4747 ErrInfo = "Subtarget requires even aligned vector registers";
4748 return false;
4749 }
4750 }
4751
4752 if (RegClass != -1) {
4753 if (Reg.isVirtual())
4754 continue;
4755
4756 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4757 if (!RC->contains(Reg)) {
4758 ErrInfo = "Operand has incorrect register class.";
4759 return false;
4760 }
4761 }
4762 }
4763
4764 // Verify SDWA
4765 if (isSDWA(MI)) {
4766 if (!ST.hasSDWA()) {
4767 ErrInfo = "SDWA is not supported on this target";
4768 return false;
4769 }
4770
4771 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4772
4773 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4774 if (OpIdx == -1)
4775 continue;
4776 const MachineOperand &MO = MI.getOperand(OpIdx);
4777
4778 if (!ST.hasSDWAScalar()) {
4779 // Only VGPRS on VI
4780 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4781 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4782 return false;
4783 }
4784 } else {
4785 // No immediates on GFX9
4786 if (!MO.isReg()) {
4787 ErrInfo =
4788 "Only reg allowed as operands in SDWA instructions on GFX9+";
4789 return false;
4790 }
4791 }
4792 }
4793
4794 if (!ST.hasSDWAOmod()) {
4795 // No omod allowed on VI
4796 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4797 if (OMod != nullptr &&
4798 (!OMod->isImm() || OMod->getImm() != 0)) {
4799 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4800 return false;
4801 }
4802 }
4803
4804 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
4805 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
4806 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
4807 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
4808 const MachineOperand *Src0ModsMO =
4809 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4810 unsigned Mods = Src0ModsMO->getImm();
4811 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
4812 Mods & SISrcMods::SEXT) {
4813 ErrInfo = "sext, abs and neg are not allowed on this instruction";
4814 return false;
4815 }
4816 }
4817
4818 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4819 if (isVOPC(BasicOpcode)) {
4820 if (!ST.hasSDWASdst() && DstIdx != -1) {
4821 // Only vcc allowed as dst on VI for VOPC
4822 const MachineOperand &Dst = MI.getOperand(DstIdx);
4823 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4824 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4825 return false;
4826 }
4827 } else if (!ST.hasSDWAOutModsVOPC()) {
4828 // No clamp allowed on GFX9 for VOPC
4829 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4830 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4831 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
4832 return false;
4833 }
4834
4835 // No omod allowed on GFX9 for VOPC
4836 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4837 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4838 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
4839 return false;
4840 }
4841 }
4842 }
4843
4844 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
4845 if (DstUnused && DstUnused->isImm() &&
4846 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4847 const MachineOperand &Dst = MI.getOperand(DstIdx);
4848 if (!Dst.isReg() || !Dst.isTied()) {
4849 ErrInfo = "Dst register should have tied register";
4850 return false;
4851 }
4852
4853 const MachineOperand &TiedMO =
4854 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
4855 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
4856 ErrInfo =
4857 "Dst register should be tied to implicit use of preserved register";
4858 return false;
4859 } else if (TiedMO.getReg().isPhysical() &&
4860 Dst.getReg() != TiedMO.getReg()) {
4861 ErrInfo = "Dst register should use same physical register as preserved";
4862 return false;
4863 }
4864 }
4865 }
4866
4867 // Verify MIMG / VIMAGE / VSAMPLE
4868 if (isImage(MI.getOpcode()) && !MI.mayStore()) {
4869 // Ensure that the return type used is large enough for all the options
4870 // being used TFE/LWE require an extra result register.
4871 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4872 if (DMask) {
4873 uint64_t DMaskImm = DMask->getImm();
4874 uint32_t RegCount =
4875 isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm);
4876 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
4877 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
4878 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
4879
4880 // Adjust for packed 16 bit values
4881 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
4882 RegCount = divideCeil(RegCount, 2);
4883
4884 // Adjust if using LWE or TFE
4885 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
4886 RegCount += 1;
4887
4888 const uint32_t DstIdx =
4889 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
4890 const MachineOperand &Dst = MI.getOperand(DstIdx);
4891 if (Dst.isReg()) {
4892 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
4893 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
4894 if (RegCount > DstSize) {
4895 ErrInfo = "Image instruction returns too many registers for dst "
4896 "register class";
4897 return false;
4898 }
4899 }
4900 }
4901 }
4902
4903 // Verify VOP*. Ignore multiple sgpr operands on writelane.
4904 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
4905 unsigned ConstantBusCount = 0;
4906 bool UsesLiteral = false;
4907 const MachineOperand *LiteralVal = nullptr;
4908
4909 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
4910 if (ImmIdx != -1) {
4911 ++ConstantBusCount;
4912 UsesLiteral = true;
4913 LiteralVal = &MI.getOperand(ImmIdx);
4914 }
4915
4916 SmallVector<Register, 2> SGPRsUsed;
4917 Register SGPRUsed;
4918
4919 // Only look at the true operands. Only a real operand can use the constant
4920 // bus, and we don't want to check pseudo-operands like the source modifier
4921 // flags.
4922 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
4923 if (OpIdx == -1)
4924 continue;
4925 const MachineOperand &MO = MI.getOperand(OpIdx);
4926 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4927 if (MO.isReg()) {
4928 SGPRUsed = MO.getReg();
4929 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
4930 ++ConstantBusCount;
4931 SGPRsUsed.push_back(SGPRUsed);
4932 }
4933 } else {
4934 if (!UsesLiteral) {
4935 ++ConstantBusCount;
4936 UsesLiteral = true;
4937 LiteralVal = &MO;
4938 } else if (!MO.isIdenticalTo(*LiteralVal)) {
4939 assert(isVOP2(MI) || isVOP3(MI));
4940 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
4941 return false;
4942 }
4943 }
4944 }
4945 }
4946
4947 SGPRUsed = findImplicitSGPRRead(MI);
4948 if (SGPRUsed) {
4949 // Implicit uses may safely overlap true operands
4950 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
4951 return !RI.regsOverlap(SGPRUsed, SGPR);
4952 })) {
4953 ++ConstantBusCount;
4954 SGPRsUsed.push_back(SGPRUsed);
4955 }
4956 }
4957
4958 // v_writelane_b32 is an exception to the constant bus restriction:
4959 // vsrc0 can be an SGPR, constant, or m0, and the lane select can be an SGPR, m0, or an inline constant
4960 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
4961 Opcode != AMDGPU::V_WRITELANE_B32) {
4962 ErrInfo = "VOP* instruction violates constant bus restriction";
4963 return false;
4964 }
4965
4966 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
4967 ErrInfo = "VOP3 instruction uses literal";
4968 return false;
4969 }
4970 }
4971
4972 // Special case for writelane - it can break the multiple-constant-bus rule,
4973 // but it still can't use more than one SGPR register
4974 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
4975 unsigned SGPRCount = 0;
4976 Register SGPRUsed;
4977
4978 for (int OpIdx : {Src0Idx, Src1Idx}) {
4979 if (OpIdx == -1)
4980 break;
4981
4982 const MachineOperand &MO = MI.getOperand(OpIdx);
4983
4984 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4985 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
4986 if (MO.getReg() != SGPRUsed)
4987 ++SGPRCount;
4988 SGPRUsed = MO.getReg();
4989 }
4990 }
4991 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
4992 ErrInfo = "WRITELANE instruction violates constant bus restriction";
4993 return false;
4994 }
4995 }
4996 }
4997
4998 // Verify misc. restrictions on specific instructions.
4999 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5000 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5001 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5002 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5003 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5004 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5005 if (!compareMachineOp(Src0, Src1) &&
5006 !compareMachineOp(Src0, Src2)) {
5007 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5008 return false;
5009 }
5010 }
5011 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5012 SISrcMods::ABS) ||
5013 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5014 SISrcMods::ABS) ||
5015 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5016 SISrcMods::ABS)) {
5017 ErrInfo = "ABS not allowed in VOP3B instructions";
5018 return false;
5019 }
5020 }
5021
5022 if (isSOP2(MI) || isSOPC(MI)) {
5023 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5024 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5025
5026 if (!Src0.isReg() && !Src1.isReg() &&
5027 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5028 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5029 !Src0.isIdenticalTo(Src1)) {
5030 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5031 return false;
5032 }
5033 }
5034
5035 if (isSOPK(MI)) {
5036 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5037 if (Desc.isBranch()) {
5038 if (!Op->isMBB()) {
5039 ErrInfo = "invalid branch target for SOPK instruction";
5040 return false;
5041 }
5042 } else {
5043 uint64_t Imm = Op->getImm();
5044 if (sopkIsZext(Opcode)) {
5045 if (!isUInt<16>(Imm)) {
5046 ErrInfo = "invalid immediate for SOPK instruction";
5047 return false;
5048 }
5049 } else {
5050 if (!isInt<16>(Imm)) {
5051 ErrInfo = "invalid immediate for SOPK instruction";
5052 return false;
5053 }
5054 }
5055 }
5056 }
5057
5058 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5059 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5060 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5061 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5062 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5063 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5064
5065 const unsigned StaticNumOps =
5066 Desc.getNumOperands() + Desc.implicit_uses().size();
5067 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5068
5069 // Allow additional implicit operands. This allows a fixup done by the post
5070 // RA scheduler where the main implicit operand is killed and implicit-defs
5071 // are added for sub-registers that remain live after this instruction.
5072 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5073 ErrInfo = "missing implicit register operands";
5074 return false;
5075 }
5076
5077 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5078 if (IsDst) {
5079 if (!Dst->isUse()) {
5080 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5081 return false;
5082 }
5083
5084 unsigned UseOpIdx;
5085 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5086 UseOpIdx != StaticNumOps + 1) {
5087 ErrInfo = "movrel implicit operands should be tied";
5088 return false;
5089 }
5090 }
5091
5092 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5093 const MachineOperand &ImpUse
5094 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5095 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5096 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5097 ErrInfo = "src0 should be subreg of implicit vector use";
5098 return false;
5099 }
5100 }
5101
5102 // Make sure we aren't losing exec uses in the td files. This mostly requires
5103 // being careful when using let Uses to try to add other use registers.
5104 if (shouldReadExec(MI)) {
5105 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5106 ErrInfo = "VALU instruction does not implicitly read exec mask";
5107 return false;
5108 }
5109 }
5110
5111 if (isSMRD(MI)) {
5112 if (MI.mayStore() &&
5113 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5114 // The register offset form of scalar stores may only use m0 as the
5115 // soffset register.
5116 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5117 if (Soff && Soff->getReg() != AMDGPU::M0) {
5118 ErrInfo = "scalar stores must use m0 as offset register";
5119 return false;
5120 }
5121 }
5122 }
5123
5124 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5125 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5126 if (Offset->getImm() != 0) {
5127 ErrInfo = "subtarget does not support offsets in flat instructions";
5128 return false;
5129 }
5130 }
5131
5132 if (isDS(MI) && !ST.hasGDS()) {
5133 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5134 if (GDSOp && GDSOp->getImm() != 0) {
5135 ErrInfo = "GDS is not supported on this subtarget";
5136 return false;
5137 }
5138 }
5139
5140 if (isImage(MI)) {
5141 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5142 if (DimOp) {
5143 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5144 AMDGPU::OpName::vaddr0);
5145 int RSrcOpName =
5146 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5147 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5148 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5149 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5150 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5151 const AMDGPU::MIMGDimInfo *Dim =
5152 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5153
5154 if (!Dim) {
5155 ErrInfo = "dim is out of range";
5156 return false;
5157 }
5158
5159 bool IsA16 = false;
5160 if (ST.hasR128A16()) {
5161 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5162 IsA16 = R128A16->getImm() != 0;
5163 } else if (ST.hasA16()) {
5164 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5165 IsA16 = A16->getImm() != 0;
5166 }
5167
5168 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5169
5170 unsigned AddrWords =
5171 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5172
5173 unsigned VAddrWords;
5174 if (IsNSA) {
5175 VAddrWords = RsrcIdx - VAddr0Idx;
5176 if (ST.hasPartialNSAEncoding() &&
5177 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5178 unsigned LastVAddrIdx = RsrcIdx - 1;
5179 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5180 }
5181 } else {
5182 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5183 if (AddrWords > 12)
5184 AddrWords = 16;
5185 }
5186
5187 if (VAddrWords != AddrWords) {
5188 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5189 << " but got " << VAddrWords << "\n");
5190 ErrInfo = "bad vaddr size";
5191 return false;
5192 }
5193 }
5194 }
5195
5196 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5197 if (DppCt) {
5198 using namespace AMDGPU::DPP;
5199
5200 unsigned DC = DppCt->getImm();
5201 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5202 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5203 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5204 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5205 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5206 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5207 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5208 ErrInfo = "Invalid dpp_ctrl value";
5209 return false;
5210 }
5211 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5213 ErrInfo = "Invalid dpp_ctrl value: "
5214 "wavefront shifts are not supported on GFX10+";
5215 return false;
5216 }
5217 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5219 ErrInfo = "Invalid dpp_ctrl value: "
5220 "broadcasts are not supported on GFX10+";
5221 return false;
5222 }
5223 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5224 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5225 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5226 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5227 !ST.hasGFX90AInsts()) {
5228 ErrInfo = "Invalid dpp_ctrl value: "
5229 "row_newbroadcast/row_share is not supported before "
5230 "GFX90A/GFX10";
5231 return false;
5232 } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5233 ErrInfo = "Invalid dpp_ctrl value: "
5234 "row_share and row_xmask are not supported before GFX10";
5235 return false;
5236 }
5237 }
5238
5239 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5241 ErrInfo = "Invalid dpp_ctrl value: "
5242 "DP ALU dpp only support row_newbcast";
5243 return false;
5244 }
5245 }
5246
5247 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5248 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5249 uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
5250 : AMDGPU::OpName::vdata;
5251 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
5252 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5253 if (Data && !Data->isReg())
5254 Data = nullptr;
5255
5256 if (ST.hasGFX90AInsts()) {
5257 if (Dst && Data &&
5258 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5259 ErrInfo = "Invalid register class: "
5260 "vdata and vdst should be both VGPR or AGPR";
5261 return false;
5262 }
5263 if (Data && Data2 &&
5264 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5265 ErrInfo = "Invalid register class: "
5266 "both data operands should be VGPR or AGPR";
5267 return false;
5268 }
5269 } else {
5270 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5271 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5272 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5273 ErrInfo = "Invalid register class: "
5274 "agpr loads and stores not supported on this GPU";
5275 return false;
5276 }
5277 }
5278 }
5279
5280 if (ST.needsAlignedVGPRs()) {
5281 const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
5282 const MachineOperand *Op = getNamedOperand(MI, OpName);
5283 if (!Op)
5284 return true;
5285 Register Reg = Op->getReg();
5286 if (Reg.isPhysical())
5287 return !(RI.getHWRegIndex(Reg) & 1);
5288 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5289 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5290 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5291 };
5292
5293 if (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
5294 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
5295 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
5296
5297 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5298 ErrInfo = "Subtarget requires even aligned vector registers "
5299 "for DS_GWS instructions";
5300 return false;
5301 }
5302 }
5303
5304 if (isMIMG(MI)) {
5305 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5306 ErrInfo = "Subtarget requires even aligned vector registers "
5307 "for vaddr operand of image instructions";
5308 return false;
5309 }
5310 }
5311 }
5312
5313 if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
5314 !ST.hasGFX90AInsts()) {
5315 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5316 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5317 ErrInfo = "Invalid register class: "
5318 "v_accvgpr_write with an SGPR is not supported on this GPU";
5319 return false;
5320 }
5321 }
5322
5323 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5324 const MachineOperand &SrcOp = MI.getOperand(1);
5325 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5326 ErrInfo = "pseudo expects only physical SGPRs";
5327 return false;
5328 }
5329 }
5330
5331 return true;
5332}
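// A minimal sketch of MIR this verifier rejects, assuming a subtarget with a
// constant bus limit of one:
//   $vgpr0 = V_ADD_F32_e64 0, $sgpr0, 0, $sgpr1, 0, 0, implicit $exec, implicit $mode
// fails with "VOP* instruction violates constant bus restriction" because the
// two different SGPR sources need two constant bus reads.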
5333
5334// It is more readable to list mapped opcodes on the same line.
5335// clang-format off
5336
5337 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5338 switch (MI.getOpcode()) {
5339 default: return AMDGPU::INSTRUCTION_LIST_END;
5340 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5341 case AMDGPU::COPY: return AMDGPU::COPY;
5342 case AMDGPU::PHI: return AMDGPU::PHI;
5343 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5344 case AMDGPU::WQM: return AMDGPU::WQM;
5345 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5346 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5347 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5348 case AMDGPU::S_MOV_B32: {
5349 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5350 return MI.getOperand(1).isReg() ||
5351 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5352 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5353 }
5354 case AMDGPU::S_ADD_I32:
5355 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5356 case AMDGPU::S_ADDC_U32:
5357 return AMDGPU::V_ADDC_U32_e32;
5358 case AMDGPU::S_SUB_I32:
5359 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5360 // FIXME: These are not consistently handled, and selected when the carry is
5361 // used.
5362 case AMDGPU::S_ADD_U32:
5363 return AMDGPU::V_ADD_CO_U32_e32;
5364 case AMDGPU::S_SUB_U32:
5365 return AMDGPU::V_SUB_CO_U32_e32;
5366 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5367 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5368 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5369 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5370 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5371 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5372 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5373 case AMDGPU::S_XNOR_B32:
5374 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5375 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5376 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5377 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5378 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5379 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5380 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5381 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5382 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5383 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5384 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5385 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5386 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5387 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5388 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5389 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5390 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5391 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5392 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5393 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5394 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5395 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5396 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5397 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5398 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5399 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5400 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5401 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5402 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5403 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5404 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5405 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5406 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5407 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5408 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5409 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5410 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5411 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5412 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5413 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5414 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5415 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5416 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5417 case AMDGPU::S_CVT_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5418 case AMDGPU::S_CVT_HI_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5419 case AMDGPU::S_CVT_F16_F32: return AMDGPU::V_CVT_F16_F32_t16_e64;
5420 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5421 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5422 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5423 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5424 case AMDGPU::S_CEIL_F16:
5425 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5426 : AMDGPU::V_CEIL_F16_fake16_e64;
5427 case AMDGPU::S_FLOOR_F16:
5428 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5429 : AMDGPU::V_FLOOR_F16_fake16_e64;
5430 case AMDGPU::S_TRUNC_F16:
5431 return AMDGPU::V_TRUNC_F16_fake16_e64;
5432 case AMDGPU::S_RNDNE_F16:
5433 return AMDGPU::V_RNDNE_F16_fake16_e64;
5434 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5435 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5436 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5437 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5438 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5439 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5440 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5441 case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
5442 case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
5443 case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
5444 case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5445 case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5446 case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5447 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
5448 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5449 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5450 case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64;
5451 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5452 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5453 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5454 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5455 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5456 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5457 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5458 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5459 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5460 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5461 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5462 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5463 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5464 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5465 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5466 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5467 case AMDGPU::S_CMP_LT_F16: return AMDGPU::V_CMP_LT_F16_t16_e64;
5468 case AMDGPU::S_CMP_EQ_F16: return AMDGPU::V_CMP_EQ_F16_t16_e64;
5469 case AMDGPU::S_CMP_LE_F16: return AMDGPU::V_CMP_LE_F16_t16_e64;
5470 case AMDGPU::S_CMP_GT_F16: return AMDGPU::V_CMP_GT_F16_t16_e64;
5471 case AMDGPU::S_CMP_LG_F16: return AMDGPU::V_CMP_LG_F16_t16_e64;
5472 case AMDGPU::S_CMP_GE_F16: return AMDGPU::V_CMP_GE_F16_t16_e64;
5473 case AMDGPU::S_CMP_O_F16: return AMDGPU::V_CMP_O_F16_t16_e64;
5474 case AMDGPU::S_CMP_U_F16: return AMDGPU::V_CMP_U_F16_t16_e64;
5475 case AMDGPU::S_CMP_NGE_F16: return AMDGPU::V_CMP_NGE_F16_t16_e64;
5476 case AMDGPU::S_CMP_NLG_F16: return AMDGPU::V_CMP_NLG_F16_t16_e64;
5477 case AMDGPU::S_CMP_NGT_F16: return AMDGPU::V_CMP_NGT_F16_t16_e64;
5478 case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
5479 case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
5480 case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
5481 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5482 case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
5483 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5484 case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
5485 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5486 case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
5487 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5488 case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
5489 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5490 case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
5491 }
5493 "Unexpected scalar opcode without corresponding vector one!");
5494}
5495
5496// clang-format on
5497
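// Save the current EXEC mask into \p Reg and then enable all lanes, either
// with two S_MOV instructions when SCC must stay live, or with a single
// S_OR_SAVEEXEC (which clobbers SCC) otherwise.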
5498void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5499 MachineBasicBlock &MBB,
5500 MachineBasicBlock::iterator MBBI,
5501 const DebugLoc &DL, Register Reg,
5502 bool IsSCCLive,
5503 SlotIndexes *Indexes) const {
5504 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5505 const SIInstrInfo *TII = ST.getInstrInfo();
5506 bool IsWave32 = ST.isWave32();
5507 if (IsSCCLive) {
5508 // Insert two move instructions, one to save the original value of EXEC and
5509 // the other to turn on all bits in EXEC. This is required as we can't use
5510 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5511 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5512 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5513 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5514 .addReg(Exec, RegState::Kill);
5515 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5516 if (Indexes) {
5517 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5518 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5519 }
5520 } else {
5521 const unsigned OrSaveExec =
5522 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5523 auto SaveExec =
5524 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5525 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5526 if (Indexes)
5527 Indexes->insertMachineInstrInMaps(*SaveExec);
5528 }
5529}
5530
5531void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5532 MachineBasicBlock::iterator MBBI,
5533 const DebugLoc &DL, Register Reg,
5534 SlotIndexes *Indexes) const {
5535 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5536 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5537 auto ExecRestoreMI =
5538 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5539 if (Indexes)
5540 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5541}
5542
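// For instructions that may not use AGPRs on this subtarget, narrow an AV_*
// (VGPR-or-AGPR) register class ID down to the corresponding VGPR-only class,
// then return the properly aligned register class.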
5543static const TargetRegisterClass *
5544adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
5545 const MachineRegisterInfo &MRI,
5546 const MCInstrDesc &TID, unsigned RCID,
5547 bool IsAllocatable) {
5548 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5549 (((TID.mayLoad() || TID.mayStore()) &&
5550 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5551 (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
5552 switch (RCID) {
5553 case AMDGPU::AV_32RegClassID:
5554 RCID = AMDGPU::VGPR_32RegClassID;
5555 break;
5556 case AMDGPU::AV_64RegClassID:
5557 RCID = AMDGPU::VReg_64RegClassID;
5558 break;
5559 case AMDGPU::AV_96RegClassID:
5560 RCID = AMDGPU::VReg_96RegClassID;
5561 break;
5562 case AMDGPU::AV_128RegClassID:
5563 RCID = AMDGPU::VReg_128RegClassID;
5564 break;
5565 case AMDGPU::AV_160RegClassID:
5566 RCID = AMDGPU::VReg_160RegClassID;
5567 break;
5568 case AMDGPU::AV_512RegClassID:
5569 RCID = AMDGPU::VReg_512RegClassID;
5570 break;
5571 default:
5572 break;
5573 }
5574 }
5575
5576 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5577}
5578
5579const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
5580 unsigned OpNum, const TargetRegisterInfo *TRI,
5581 const MachineFunction &MF)
5582 const {
5583 if (OpNum >= TID.getNumOperands())
5584 return nullptr;
5585 auto RegClass = TID.operands()[OpNum].RegClass;
5586 bool IsAllocatable = false;
5587 if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
5588 // vdst and vdata should both be VGPR or AGPR, same for the DS instructions
5589 // with two data operands. Request a register class constrained to VGPR only
5590 // if both operands are present, as Machine Copy Propagation cannot check this
5591 // constraint (and possibly other passes cannot either).
5592 //
5593 // The check is limited to FLAT and DS because atomics in non-flat encoding
5594 // have their vdst and vdata tied to be the same register.
5595 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5596 AMDGPU::OpName::vdst);
5597 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5598 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5599 : AMDGPU::OpName::vdata);
5600 if (DataIdx != -1) {
5601 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5602 TID.Opcode, AMDGPU::OpName::data1);
5603 }
5604 }
5605 return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
5606 IsAllocatable);
5607}
5608
5609const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
5610 unsigned OpNo) const {
5611 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5612 const MCInstrDesc &Desc = get(MI.getOpcode());
5613 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
5614 Desc.operands()[OpNo].RegClass == -1) {
5615 Register Reg = MI.getOperand(OpNo).getReg();
5616
5617 if (Reg.isVirtual())
5618 return MRI.getRegClass(Reg);
5619 return RI.getPhysRegBaseClass(Reg);
5620 }
5621
5622 unsigned RCID = Desc.operands()[OpNo].RegClass;
5623 return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
5624}
5625
5626void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
5627 MachineBasicBlock::iterator I = MI;
5628 MachineBasicBlock *MBB = MI.getParent();
5629 MachineOperand &MO = MI.getOperand(OpIdx);
5630 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
5631 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
5632 const TargetRegisterClass *RC = RI.getRegClass(RCID);
5633 unsigned Size = RI.getRegSizeInBits(*RC);
5634 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
5635 if (MO.isReg())
5636 Opcode = AMDGPU::COPY;
5637 else if (RI.isSGPRClass(RC))
5638 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
5639
5640 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
5641 Register Reg = MRI.createVirtualRegister(VRC);
5642 const DebugLoc &DL = MBB->findDebugLoc(I);
5643 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
5644 MO.ChangeToRegister(Reg, false);
5645}
5646
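// Extract sub-register \p SubIdx of \p SuperReg into a fresh virtual register
// of class \p SubRC, copying through a new super-register first if \p SuperReg
// itself already carries a sub-register index.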
5647unsigned SIInstrInfo::buildExtractSubReg(
5648 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
5649 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
5650 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5651 MachineBasicBlock *MBB = MI->getParent();
5652 DebugLoc DL = MI->getDebugLoc();
5653 Register SubReg = MRI.createVirtualRegister(SubRC);
5654
5655 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
5656 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5657 .addReg(SuperReg.getReg(), 0, SubIdx);
5658 return SubReg;
5659 }
5660
5661 // Just in case the super register is itself a sub-register, copy it to a new
5662 // value so we don't need to worry about merging its subreg index with the
5663 // SubIdx passed to this function. The register coalescer should be able to
5664 // eliminate this extra copy.
5665 Register NewSuperReg = MRI.createVirtualRegister(SuperRC);
5666
5667 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
5668 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
5669
5670 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5671 .addReg(NewSuperReg, 0, SubIdx);
5672
5673 return SubReg;
5674}
5675
5676MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
5677 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
5678 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
5679 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5680 if (Op.isImm()) {
5681 if (SubIdx == AMDGPU::sub0)
5682 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
5683 if (SubIdx == AMDGPU::sub1)
5684 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
5685
5686 llvm_unreachable("Unhandled register index for immediate");
5687 }
5688
5689 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
5690 SubIdx, SubRC);
5691 return MachineOperand::CreateReg(SubReg, false);
5692}
5693
5694// Change the order of operands from (0, 1, 2) to (0, 2, 1)
5695void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
5696 assert(Inst.getNumExplicitOperands() == 3);
5697 MachineOperand Op1 = Inst.getOperand(1);
5698 Inst.removeOperand(1);
5699 Inst.addOperand(Op1);
5700}
5701
5702bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
5703 const MCOperandInfo &OpInfo,
5704 const MachineOperand &MO) const {
5705 if (!MO.isReg())
5706 return false;
5707
5708 Register Reg = MO.getReg();
5709
5710 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
5711 if (Reg.isPhysical())
5712 return DRC->contains(Reg);
5713
5714 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
5715
5716 if (MO.getSubReg()) {
5717 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5718 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
5719 if (!SuperRC)
5720 return false;
5721
5722 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
5723 if (!DRC)
5724 return false;
5725 }
5726 return RC->hasSuperClassEq(DRC);
5727}
5728
5729bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
5730 const MCOperandInfo &OpInfo,
5731 const MachineOperand &MO) const {
5732 if (MO.isReg())
5733 return isLegalRegOperand(MRI, OpInfo, MO);
5734
5735 // Handle non-register types that are treated like immediates.
5736 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
5737 return true;
5738}
5739
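// Check whether \p MO (or, if null, the operand already at \p OpIdx) would be
// legal at operand index \p OpIdx of \p MI, taking the subtarget's constant
// bus and literal limits into account.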
5740bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
5741 const MachineOperand *MO) const {
5742 const MachineFunction &MF = *MI.getParent()->getParent();
5743 const MachineRegisterInfo &MRI = MF.getRegInfo();
5744 const MCInstrDesc &InstDesc = MI.getDesc();
5745 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
5746 const TargetRegisterClass *DefinedRC =
5747 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
5748 if (!MO)
5749 MO = &MI.getOperand(OpIdx);
5750
5751 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
5752 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
5753 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
5754 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--)
5755 return false;
5756
5757 SmallDenseSet<RegSubRegPair> SGPRsUsed;
5758 if (MO->isReg())
5759 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
5760
5761 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
5762 if (i == OpIdx)
5763 continue;
5764 const MachineOperand &Op = MI.getOperand(i);
5765 if (Op.isReg()) {
5766 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
5767 if (!SGPRsUsed.count(SGPR) &&
5768 // FIXME: This can access off the end of the operands() array.
5769 usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
5770 if (--ConstantBusLimit <= 0)
5771 return false;
5772 SGPRsUsed.insert(SGPR);
5773 }
5774 } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
5775 !isInlineConstant(Op, InstDesc.operands()[i])) {
5776 if (!LiteralLimit--)
5777 return false;
5778 if (--ConstantBusLimit <= 0)
5779 return false;
5780 }
5781 }
5782 }
5783
5784 if (MO->isReg()) {
5785 if (!DefinedRC)
5786 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
5787 if (!isLegalRegOperand(MRI, OpInfo, *MO))
5788 return false;
5789 bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
5790 if (IsAGPR && !ST.hasMAIInsts())
5791 return false;
5792 unsigned Opc = MI.getOpcode();
5793 if (IsAGPR &&
5794 (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5795 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5796 return false;
5797 // Atomics should have vdst and vdata either both in VGPRs or both in AGPRs.
5798 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5799 const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
5800 isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5801 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5802 MI.getOperand(DataIdx).isReg() &&
5803 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5804 return false;
5805 if ((int)OpIdx == DataIdx) {
5806 if (VDstIdx != -1 &&
5807 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5808 return false;
5809 // DS instructions with 2 src operands also must have tied RC.
5810 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
5811 AMDGPU::OpName::data1);
5812 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5813 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5814 return false;
5815 }
5816 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5817 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5818 RI.isSGPRReg(MRI, MO->getReg()))
5819 return false;
5820 return true;
5821 }
5822
5823 if (MO->isImm()) {
5824 uint64_t Imm = MO->getImm();
5825 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
5826 bool Is64BitOp = Is64BitFPOp ||
5827 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
5828 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
5829 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
5830 if (Is64BitOp &&
5831 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
5832 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
5833 return false;
5834
5835 // FIXME: We can use sign extended 64-bit literals, but only for signed
5836 // operands. At the moment we do not know if an operand is signed.
5837 // Such an operand will be encoded as its low 32 bits and then either
5838 // correctly sign extended or incorrectly zero extended by HW.
5839 if (!Is64BitFPOp && (int32_t)Imm < 0)
5840 return false;
5841 }
5842 }
5843
5844 // Handle non-register types that are treated like immediates.
5845 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
5846
5847 if (!DefinedRC) {
5848 // This operand expects an immediate.
5849 return true;
5850 }
5851
5852 return isImmOperandLegal(MI, OpIdx, *MO);
5853}
5854
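// Legalize the operands of a VOP2/VOPC instruction. Pre-GFX10 only a single
// constant bus use is allowed, so prefer commuting src0/src1 over inserting
// extra moves, and handle the V_WRITELANE/V_READLANE special cases.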
5855void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
5856 MachineInstr &MI) const {
5857 unsigned Opc = MI.getOpcode();
5858 const MCInstrDesc &InstrDesc = get(Opc);
5859
5860 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5861 MachineOperand &Src0 = MI.getOperand(Src0Idx);
5862
5863 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
5864 MachineOperand &Src1 = MI.getOperand(Src1Idx);
5865
5866 // If there is an implicit SGPR use such as the VCC use of v_addc_u32/v_subb_u32,
5867 // only one constant bus use in total is allowed before GFX10.
5868 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
5869 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
5870 RI.isSGPRReg(MRI, Src0.getReg()))
5871 legalizeOpWithMove(MI, Src0Idx);
5872
5873 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
5874 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
5875 // src0/src1 with V_READFIRSTLANE.
5876 if (Opc == AMDGPU::V_WRITELANE_B32) {
5877 const DebugLoc &DL = MI.getDebugLoc();
5878 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
5879 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5880 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5881 .add(Src0);
5882 Src0.ChangeToRegister(Reg, false);
5883 }
5884 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
5885 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5886 const DebugLoc &DL = MI.getDebugLoc();
5887 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5888 .add(Src1);
5889 Src1.ChangeToRegister(Reg, false);
5890 }
5891 return;
5892 }
5893
5894 // No VOP2 instructions support AGPRs.
5895 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
5896 legalizeOpWithMove(MI, Src0Idx);
5897
5898 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
5899 legalizeOpWithMove(MI, Src1Idx);
5900
5901 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
5902 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
5903 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
5904 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
5905 legalizeOpWithMove(MI, Src2Idx);
5906 }
5907
5908 // VOP2 src0 operands support all operand types, so we don't need to check
5909 // their legality. If src1 is already legal, we don't need to do anything.
5910 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
5911 return;
5912
5913 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
5914 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
5915 // select is uniform.
5916 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
5917 RI.isVGPR(MRI, Src1.getReg())) {
5918 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5919 const DebugLoc &DL = MI.getDebugLoc();
5920 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5921 .add(Src1);
5922 Src1.ChangeToRegister(Reg, false);
5923 return;
5924 }
5925
5926 // We do not use commuteInstruction here because it is too aggressive and will
5927 // commute if it is possible. We only want to commute here if it improves
5928 // legality. This can be called a fairly large number of times so don't waste
5929 // compile time pointlessly swapping and checking legality again.
5930 if (HasImplicitSGPR || !MI.isCommutable()) {
5931 legalizeOpWithMove(MI, Src1Idx);
5932 return;
5933 }
5934
5935 // If src0 can be used as src1, commuting will make the operands legal.
5936 // Otherwise we have to give up and insert a move.
5937 //
5938 // TODO: Other immediate-like operand kinds could be commuted if there was a
5939 // MachineOperand::ChangeTo* for them.
5940 if ((!Src1.isImm() && !Src1.isReg()) ||
5941 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
5942 legalizeOpWithMove(MI, Src1Idx);
5943 return;
5944 }
5945
5946 int CommutedOpc = commuteOpcode(MI);
5947 if (CommutedOpc == -1) {
5948 legalizeOpWithMove(MI, Src1Idx);
5949 return;
5950 }
5951
5952 MI.setDesc(get(CommutedOpc));
5953
5954 Register Src0Reg = Src0.getReg();
5955 unsigned Src0SubReg = Src0.getSubReg();
5956 bool Src0Kill = Src0.isKill();
5957
5958 if (Src1.isImm())
5959 Src0.ChangeToImmediate(Src1.getImm());
5960 else if (Src1.isReg()) {
5961 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
5962 Src0.setSubReg(Src1.getSubReg());
5963 } else
5964 llvm_unreachable("Should only have register or immediate operands");
5965
5966 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
5967 Src1.setSubReg(Src0SubReg);
5968 fixImplicitOperands(MI);
5969}
5970
5971// Legalize VOP3 operands. All operand types are supported for any operand
5972// but only one literal constant and only starting from GFX10.
5973void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
5974 MachineInstr &MI) const {
5975 unsigned Opc = MI.getOpcode();
5976
5977 int VOP3Idx[3] = {
5978 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
5979 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
5980 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
5981 };
5982
5983 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
5984 Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
5985 // src1 and src2 must be scalar
5986 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
5987 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
5988 const DebugLoc &DL = MI.getDebugLoc();
5989 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
5990 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5991 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5992 .add(Src1);
5993 Src1.ChangeToRegister(Reg, false);
5994 }
5995 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
5996 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5997 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5998 .add(Src2);
5999 Src2.ChangeToRegister(Reg, false);
6000 }
6001 }
6002
6003 // Find the one SGPR operand we are allowed to use.
6004 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6005 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6006 SmallDenseSet<unsigned> SGPRsUsed;
6007 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6008 if (SGPRReg) {
6009 SGPRsUsed.insert(SGPRReg);
6010 --ConstantBusLimit;
6011 }
6012
6013 for (int Idx : VOP3Idx) {
6014 if (Idx == -1)
6015 break;
6016 MachineOperand &MO = MI.getOperand(Idx);
6017
6018 if (!MO.isReg()) {
6019 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6020 continue;
6021
6022 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6023 --LiteralLimit;
6024 --ConstantBusLimit;
6025 continue;
6026 }
6027
6028 --LiteralLimit;
6029 --ConstantBusLimit;
6030 legalizeOpWithMove(MI, Idx);
6031 continue;
6032 }
6033
6034 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6035 !isOperandLegal(MI, Idx, &MO)) {
6036 legalizeOpWithMove(MI, Idx);
6037 continue;
6038 }
6039
6040 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6041 continue; // VGPRs are legal
6042
6043 // We can use one SGPR in each VOP3 instruction prior to GFX10
6044 // and two starting from GFX10.
6045 if (SGPRsUsed.count(MO.getReg()))
6046 continue;
6047 if (ConstantBusLimit > 0) {
6048 SGPRsUsed.insert(MO.getReg());
6049 --ConstantBusLimit;
6050 continue;
6051 }
6052
6053 // If we make it this far, then the operand is not legal and we must
6054 // legalize it.
6055 legalizeOpWithMove(MI, Idx);
6056 }
6057
6058 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6059 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6060 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6061 legalizeOpWithMove(MI, VOP3Idx[2]);
6062}
6063
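// Copy the value of the (possibly multi-dword) VGPR \p SrcReg into a new SGPR
// of the equivalent class, emitting one V_READFIRSTLANE_B32 per 32-bit piece.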
6064Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
6065 MachineRegisterInfo &MRI) const {
6066 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6067 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6068 Register DstReg = MRI.createVirtualRegister(SRC);
6069 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6070
6071 if (RI.hasAGPRs(VRC)) {
6072 VRC = RI.getEquivalentVGPRClass(VRC);
6073 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6074 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6075 get(TargetOpcode::COPY), NewSrcReg)
6076 .addReg(SrcReg);
6077 SrcReg = NewSrcReg;
6078 }
6079
6080 if (SubRegs == 1) {
6081 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6082 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6083 .addReg(SrcReg);
6084 return DstReg;
6085 }
6086
6087 SmallVector<Register, 8> SRegs;
6088 for (unsigned i = 0; i < SubRegs; ++i) {
6089 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6090 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6091 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6092 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6093 SRegs.push_back(SGPR);
6094 }
6095
6096 MachineInstrBuilder MIB =
6097 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6098 get(AMDGPU::REG_SEQUENCE), DstReg);
6099 for (unsigned i = 0; i < SubRegs; ++i) {
6100 MIB.addReg(SRegs[i]);
6101 MIB.addImm(RI.getSubRegFromChannel(i));
6102 }
6103 return DstReg;
6104}
6105
6106void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6107 MachineInstr &MI) const {
6108
6109 // If the pointer is stored in VGPRs, then we need to move it to
6110 // SGPRs using v_readfirstlane. This is safe because we only select
6111 // loads with uniform pointers to SMRD instruction so we know the
6112 // pointer value is uniform.
6113 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6114 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6115 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6116 SBase->setReg(SGPR);
6117 }
6118 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6119 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
6120 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6121 SOff->setReg(SGPR);
6122 }
6123}
6124
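// If a FLAT *_SADDR instruction ended up with a VGPR in its saddr operand,
// try to rewrite it to the corresponding VADDR form; the old vaddr must be
// zero or absent. Returns true if the instruction was rewritten.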
6125bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6126 unsigned Opc = Inst.getOpcode();
6127 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6128 if (OldSAddrIdx < 0)
6129 return false;
6130
6131 assert(isSegmentSpecificFLAT(Inst));
6132
6133 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6134 if (NewOpc < 0)
6135 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6136 if (NewOpc < 0)
6137 return false;
6138
6139 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6140 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6141 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6142 return false;
6143
6144 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6145 if (NewVAddrIdx < 0)
6146 return false;
6147
6148 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6149
6150 // Check vaddr, it shall be zero or absent.
6151 MachineInstr *VAddrDef = nullptr;
6152 if (OldVAddrIdx >= 0) {
6153 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6154 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6155 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6156 !VAddrDef->getOperand(1).isImm() ||
6157 VAddrDef->getOperand(1).getImm() != 0)
6158 return false;
6159 }
6160
6161 const MCInstrDesc &NewDesc = get(NewOpc);
6162 Inst.setDesc(NewDesc);
6163
6164 // Callers expect iterator to be valid after this call, so modify the
6165 // instruction in place.
6166 if (OldVAddrIdx == NewVAddrIdx) {
6167 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6168 // Clear use list from the old vaddr holding a zero register.
6169 MRI.removeRegOperandFromUseList(&NewVAddr);
6170 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6171 Inst.removeOperand(OldSAddrIdx);
6172 // Update the use list with the pointer we have just moved from vaddr to
6173 // saddr position. Otherwise new vaddr will be missing from the use list.
6174 MRI.removeRegOperandFromUseList(&NewVAddr);
6175 MRI.addRegOperandToUseList(&NewVAddr);
6176 } else {
6177 assert(OldSAddrIdx == NewVAddrIdx);
6178
6179 if (OldVAddrIdx >= 0) {
6180 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6181 AMDGPU::OpName::vdst_in);
6182
6183 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6184 // it asserts. Untie the operands for now and retie them afterwards.
6185 if (NewVDstIn != -1) {
6186 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6187 Inst.untieRegOperand(OldVDstIn);
6188 }
6189
6190 Inst.removeOperand(OldVAddrIdx);
6191
6192 if (NewVDstIn != -1) {
6193 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6194 Inst.tieOperands(NewVDst, NewVDstIn);
6195 }
6196 }
6197 }
6198
6199 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6200 VAddrDef->eraseFromParent();
6201
6202 return true;
6203}
6204
6205// FIXME: Remove this when SelectionDAG is obsoleted.
6206void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6207 MachineInstr &MI) const {
6208 if (!isSegmentSpecificFLAT(MI))
6209 return;
6210
6211 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6212 // thinks they are uniform, so a readfirstlane should be valid.
6213 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6214 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6215 return;
6216
6217 if (moveFlatAddrToVGPR(MI))
6218 return;
6219
6220 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
6221 SAddr->setReg(ToSGPR);
6222}
6223
6224void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6225 MachineBasicBlock::iterator I,
6226 const TargetRegisterClass *DstRC,
6227 MachineOperand &Op,
6228 MachineRegisterInfo &MRI,
6229 const DebugLoc &DL) const {
6230 Register OpReg = Op.getReg();
6231 unsigned OpSubReg = Op.getSubReg();
6232
6233 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6234 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6235
6236 // Check if operand is already the correct register class.
6237 if (DstRC == OpRC)
6238 return;
6239
6240 Register DstReg = MRI.createVirtualRegister(DstRC);
6241 auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
6242
6243 Op.setReg(DstReg);
6244 Op.setSubReg(0);
6245
6246 MachineInstr *Def = MRI.getVRegDef(OpReg);
6247 if (!Def)
6248 return;
6249
6250 // Try to eliminate the copy if it is copying an immediate value.
6251 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6252 foldImmediate(*Copy, *Def, OpReg, &MRI);
6253
6254 bool ImpDef = Def->isImplicitDef();
6255 while (!ImpDef && Def && Def->isCopy()) {
6256 if (Def->getOperand(1).getReg().isPhysical())
6257 break;
6258 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6259 ImpDef = Def && Def->isImplicitDef();
6260 }
6261 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6262 !ImpDef)
6263 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6264}
6265
6266// Emit the actual waterfall loop, executing the wrapped instruction for each
6267// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6268// iteration, in the worst case we execute 64 (once per lane).
6269static void emitLoadScalarOpsFromVGPRLoop(
6270 const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB,
6271 MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL,
6272 ArrayRef<MachineOperand *> ScalarOps) {
6273 MachineFunction &MF = *OrigBB.getParent();
6274 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6275 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6276 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6277 unsigned SaveExecOpc =
6278 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6279 unsigned XorTermOpc =
6280 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6281 unsigned AndOpc =
6282 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6283 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6284
6285 MachineBasicBlock::iterator I = LoopBB.begin();
6286
6287 SmallVector<Register, 8> ReadlanePieces;
6288 Register CondReg;
6289
6290 for (MachineOperand *ScalarOp : ScalarOps) {
6291 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6292 unsigned NumSubRegs = RegSize / 32;
6293 Register VScalarOp = ScalarOp->getReg();
6294
6295 if (NumSubRegs == 1) {
6296 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6297
6298 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6299 .addReg(VScalarOp);
6300
6301 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6302
6303 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6304 .addReg(CurReg)
6305 .addReg(VScalarOp);
6306
6307 // Combine the comparison results with AND.
6308 if (!CondReg) // First.
6309 CondReg = NewCondReg;
6310 else { // If not the first, we create an AND.
6311 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6312 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6313 .addReg(CondReg)
6314 .addReg(NewCondReg);
6315 CondReg = AndReg;
6316 }
6317
6318 // Update ScalarOp operand to use the SGPR ScalarOp.
6319 ScalarOp->setReg(CurReg);
6320 ScalarOp->setIsKill();
6321 } else {
6322 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6323 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6324 "Unhandled register size");
6325
6326 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6327 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6328 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6329
6330 // Read the next variant <- also loop target.
6331 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6332 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6333
6334 // Read the next variant <- also loop target.
6335 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6336 .addReg(VScalarOp, VScalarOpUndef,
6337 TRI->getSubRegFromChannel(Idx + 1));
6338
6339 ReadlanePieces.push_back(CurRegLo);
6340 ReadlanePieces.push_back(CurRegHi);
6341
6342 // Comparison is to be done as 64-bit.
6343 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6344 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6345 .addReg(CurRegLo)
6346 .addImm(AMDGPU::sub0)
6347 .addReg(CurRegHi)
6348 .addImm(AMDGPU::sub1);
6349
6350 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6351 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6352 NewCondReg)
6353 .addReg(CurReg);
6354 if (NumSubRegs <= 2)
6355 Cmp.addReg(VScalarOp);
6356 else
6357 Cmp.addReg(VScalarOp, VScalarOpUndef,
6358 TRI->getSubRegFromChannel(Idx, 2));
6359
6360 // Combine the comparison results with AND.
6361 if (!CondReg) // First.
6362 CondReg = NewCondReg;
6363 else { // If not the first, we create an AND.
6364 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6365 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6366 .addReg(CondReg)
6367 .addReg(NewCondReg);
6368 CondReg = AndReg;
6369 }
6370 } // End for loop.
6371
6372 auto SScalarOpRC =
6373 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6374 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6375
6376 // Build scalar ScalarOp.
6377 auto Merge =
6378 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6379 unsigned Channel = 0;
6380 for (Register Piece : ReadlanePieces) {
6381 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6382 }
6383
6384 // Update ScalarOp operand to use the SGPR ScalarOp.
6385 ScalarOp->setReg(SScalarOp);
6386 ScalarOp->setIsKill();
6387 }
6388 }
6389
6390 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6391 MRI.setSimpleHint(SaveExec, CondReg);
6392
6393 // Update EXEC to matching lanes, saving original to SaveExec.
6394 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6395 .addReg(CondReg, RegState::Kill);
6396
6397 // The original instruction is here; we insert the terminators after it.
6398 I = BodyBB.end();
6399
6400 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6401 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6402 .addReg(Exec)
6403 .addReg(SaveExec);
6404
6405 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6406}
6407
6408// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6409// with SGPRs by iterating over all unique values across all lanes.
6410// Returns the loop basic block that now contains \p MI.
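// The resulting control flow is:
//   MBB -> LoopBB: v_readfirstlane the scalar operands, compare them against
//                  the VGPR values and s_and_saveexec the matching lanes;
//   LoopBB -> BodyBB: the original instruction(s), followed by terminators
//                  that xor the handled lanes out of EXEC and branch back to
//                  LoopBB while lanes remain;
//   BodyBB -> RemainderBB: SCC and EXEC are restored and MBB's original
//                  successors are taken over.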
6411static MachineBasicBlock *
6412loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6413 ArrayRef<MachineOperand *> ScalarOps,
6414 MachineDominatorTree *MDT,
6415 MachineBasicBlock::iterator Begin = nullptr,
6416 MachineBasicBlock::iterator End = nullptr) {
6417 MachineBasicBlock &MBB = *MI.getParent();
6418 MachineFunction &MF = *MBB.getParent();
6419 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6420 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6421 MachineRegisterInfo &MRI = MF.getRegInfo();
6422 if (!Begin.isValid())
6423 Begin = &MI;
6424 if (!End.isValid()) {
6425 End = &MI;
6426 ++End;
6427 }
6428 const DebugLoc &DL = MI.getDebugLoc();
6429 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6430 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6431 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6432
6433 // Save SCC. Waterfall Loop may overwrite SCC.
6434 Register SaveSCCReg;
6435 bool SCCNotDead = (MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI, 30) !=
6436 MachineBasicBlock::LQR_Dead);
6437 if (SCCNotDead) {
6438 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6439 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6440 .addImm(1)
6441 .addImm(0);
6442 }
6443
6444 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6445
6446 // Save the EXEC mask
6447 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6448
6449 // Killed uses in the instruction we are waterfalling around will be
6450 // incorrect due to the added control-flow.
6451 MachineBasicBlock::iterator AfterMI = MI;
6452 ++AfterMI;
6453 for (auto I = Begin; I != AfterMI; I++) {
6454 for (auto &MO : I->all_uses())
6455 MRI.clearKillFlags(MO.getReg());
6456 }
6457
6458 // To insert the loop we need to split the block. Move everything after this
6459 // point to a new block, and insert a new empty block between the two.
6460 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
6461 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
6462 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6463 MachineFunction::iterator MBBI(MBB);
6464 ++MBBI;
6465
6466 MF.insert(MBBI, LoopBB);
6467 MF.insert(MBBI, BodyBB);
6468 MF.insert(MBBI, RemainderBB);
6469
6470 LoopBB->addSuccessor(BodyBB);
6471 BodyBB->addSuccessor(LoopBB);
6472 BodyBB->addSuccessor(RemainderBB);
6473
6474 // Move the instructions from Begin through MI into BodyBB, and the
6475 // remainder of the block to RemainderBB.
6476 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6477 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6478 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6479
6480 MBB.addSuccessor(LoopBB);
6481
6482 // Update dominators. We know that MBB immediately dominates LoopBB, that
6483 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6484 // RemainderBB. RemainderBB immediately dominates all of the successors
6485 // transferred to it from MBB that MBB used to properly dominate.
6486 if (MDT) {
6487 MDT->addNewBlock(LoopBB, &MBB);
6488 MDT->addNewBlock(BodyBB, LoopBB);
6489 MDT->addNewBlock(RemainderBB, BodyBB);
6490 for (auto &Succ : RemainderBB->successors()) {
6491 if (MDT->properlyDominates(&MBB, Succ)) {
6492 MDT->changeImmediateDominator(Succ, RemainderBB);
6493 }
6494 }
6495 }
6496
6497 emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps);
6498
6499 MachineBasicBlock::iterator First = RemainderBB->begin();
6500 // Restore SCC
6501 if (SCCNotDead) {
6502 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
6503 .addReg(SaveSCCReg, RegState::Kill)
6504 .addImm(0);
6505 }
6506
6507 // Restore the EXEC mask
6508 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
6509 return BodyBB;
6510}
6511
6512// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
6513static std::tuple<unsigned, unsigned>
6514extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
6515 MachineBasicBlock &MBB = *MI.getParent();
6516 MachineFunction &MF = *MBB.getParent();
6517 MachineRegisterInfo &MRI = MF.getRegInfo();
6518
6519 // Extract the ptr from the resource descriptor.
6520 unsigned RsrcPtr =
6521 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
6522 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
6523
6524 // Create an empty resource descriptor
6525 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6526 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6527 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6528 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6529 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
6530
6531 // Zero64 = 0
6532 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
6533 .addImm(0);
6534
6535 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6536 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
6537 .addImm(RsrcDataFormat & 0xFFFFFFFF);
6538
6539 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6540 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
6541 .addImm(RsrcDataFormat >> 32);
6542
6543 // NewSRsrc = {Zero64, SRsrcFormat}
6544 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
6545 .addReg(Zero64)
6546 .addImm(AMDGPU::sub0_sub1)
6547 .addReg(SRsrcFormatLo)
6548 .addImm(AMDGPU::sub2)
6549 .addReg(SRsrcFormatHi)
6550 .addImm(AMDGPU::sub3);
6551
6552 return std::tuple(RsrcPtr, NewSRsrc);
6553}
6554
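// Legalize all operands of \p MI so its register class constraints are
// satisfied, inserting copies, readfirstlanes or a waterfall loop as needed.
// Returns the basic block that now contains \p MI if new blocks were created.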
6555MachineBasicBlock *
6556SIInstrInfo::legalizeOperands(MachineInstr &MI,
6557 MachineDominatorTree *MDT) const {
6558 MachineFunction &MF = *MI.getParent()->getParent();
6559 MachineRegisterInfo &MRI = MF.getRegInfo();
6560 MachineBasicBlock *CreatedBB = nullptr;
6561
6562 // Legalize VOP2
6563 if (isVOP2(MI) || isVOPC(MI)) {
6564 legalizeOperandsVOP2(MRI, MI);
6565 return CreatedBB;
6566 }
6567
6568 // Legalize VOP3
6569 if (isVOP3(MI)) {
6570 legalizeOperandsVOP3(MRI, MI);
6571 return CreatedBB;
6572 }
6573
6574 // Legalize SMRD
6575 if (isSMRD(MI)) {
6576 legalizeOperandsSMRD(MRI, MI);
6577 return CreatedBB;
6578 }
6579
6580 // Legalize FLAT
6581 if (isFLAT(MI)) {
6582 legalizeOperandsFLAT(MRI, MI);
6583 return CreatedBB;
6584 }
6585
6586 // Legalize REG_SEQUENCE and PHI
6587 // The register class of the operands must be the same type as the register
6588 // class of the output.
6589 if (MI.getOpcode() == AMDGPU::PHI) {
6590 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
6591 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
6592 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
6593 continue;
6594 const TargetRegisterClass *OpRC =
6595 MRI.getRegClass(MI.getOperand(i).getReg());
6596 if (RI.hasVectorRegisters(OpRC)) {
6597 VRC = OpRC;
6598 } else {
6599 SRC = OpRC;
6600 }
6601 }
6602
6603 // If any of the operands are VGPR registers, then they all must be
6604 // VGPRs, otherwise we will create illegal VGPR->SGPR copies when
6605 // legalizing them.
6606 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
6607 if (!VRC) {
6608 assert(SRC);
6609 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
6610 VRC = &AMDGPU::VReg_1RegClass;
6611 } else
6612 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6613 ? RI.getEquivalentAGPRClass(SRC)
6614 : RI.getEquivalentVGPRClass(SRC);
6615 } else {
6616 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6617 ? RI.getEquivalentAGPRClass(VRC)
6618 : RI.getEquivalentVGPRClass(VRC);
6619 }
6620 RC = VRC;
6621 } else {
6622 RC = SRC;
6623 }
6624
6625 // Update all the operands so they have the same type.
6626 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6627 MachineOperand &Op = MI.getOperand(I);
6628 if (!Op.isReg() || !Op.getReg().isVirtual())
6629 continue;
6630
6631 // MI is a PHI instruction.
6632 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
6633 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
6634
6635 // Avoid creating no-op copies with the same src and dst reg class. These
6636 // confuse some of the machine passes.
6637 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
6638 }
6639 }
6640
6641 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6642 // VGPR dest type and SGPR sources, insert copies so all operands are
6643 // VGPRs. This seems to help operand folding / the register coalescer.
6644 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
6645 MachineBasicBlock *MBB = MI.getParent();
6646 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
6647 if (RI.hasVGPRs(DstRC)) {
6648 // Update all the operands so they are VGPR register classes. These may
6649 // not be the same register class because REG_SEQUENCE supports mixing
6650 // subregister index types e.g. sub0_sub1 + sub2 + sub3
6651 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6652 MachineOperand &Op = MI.getOperand(I);
6653 if (!Op.isReg() || !Op.getReg().isVirtual())
6654 continue;
6655
6656 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
6657 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
6658 if (VRC == OpRC)
6659 continue;
6660
6661 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
6662 Op.setIsKill();
6663 }
6664 }
6665
6666 return CreatedBB;
6667 }
6668
6669 // Legalize INSERT_SUBREG
6670 // src0 must have the same register class as dst
6671 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
6672 Register Dst = MI.getOperand(0).getReg();
6673 Register Src0 = MI.getOperand(1).getReg();
6674 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
6675 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
6676 if (DstRC != Src0RC) {
6677 MachineBasicBlock *MBB = MI.getParent();
6678 MachineOperand &Op = MI.getOperand(1);
6679 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
6680 }
6681 return CreatedBB;
6682 }
6683
6684 // Legalize SI_INIT_M0
6685 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
6686 MachineOperand &Src = MI.getOperand(0);
6687 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6688 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6689 return CreatedBB;
6690 }
6691
6692 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6693 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
6694 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
6695 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
6696 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
6697 MI.getOpcode() == AMDGPU::S_WQM_B64) {
6698 MachineOperand &Src = MI.getOperand(1);
6699 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6700 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6701 return CreatedBB;
6702 }
6703
6704 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
6705 //
6706 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
6707 // scratch memory access. In both cases, the legalization never involves
6708 // conversion to the addr64 form.
6709 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
6710 (isMUBUF(MI) || isMTBUF(MI)))) {
6711 int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc
6712 : AMDGPU::OpName::srsrc;
6713 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
6714 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
6715 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
6716
6717 int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
6718 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
6719 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
6720 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
6721
6722 return CreatedBB;
6723 }
6724
6725 // Legalize SI_CALL
6726 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
6727 MachineOperand *Dest = &MI.getOperand(0);
6728 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
6729 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and the
6730 // following copies; copies from and to physical registers also need to be
6731 // moved into the loop block.
6732 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
6733 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
6734
6735 // Also move the copies to physical registers into the loop block
6736 MachineBasicBlock &MBB = *MI.getParent();
6737 MachineBasicBlock::iterator Start(&MI);
6738 while (Start->getOpcode() != FrameSetupOpcode)
6739 --Start;
6740 MachineBasicBlock::iterator End(&MI);
6741 while (End->getOpcode() != FrameDestroyOpcode)
6742 ++End;
6743 // Also include following copies of the return value
6744 ++End;
6745 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
6746 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
6747 ++End;
6748 CreatedBB =
6749 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
6750 }
6751 }
6752
6753 // Legalize s_sleep_var.
6754 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
6755 const DebugLoc &DL = MI.getDebugLoc();
6756 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6757 int Src0Idx =
6758 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
6759 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6760 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6761 .add(Src0);
6762 Src0.ChangeToRegister(Reg, false);
6763 return nullptr;
6764 }
6765
6766 // Legalize MUBUF instructions.
6767 bool isSoffsetLegal = true;
6768 int SoffsetIdx =
6769 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
6770 if (SoffsetIdx != -1) {
6771 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
6772 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
6773 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
6774 isSoffsetLegal = false;
6775 }
6776 }
6777
6778 bool isRsrcLegal = true;
6779 int RsrcIdx =
6780 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
6781 if (RsrcIdx != -1) {
6782 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6783 if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) {
6784 isRsrcLegal = false;
6785 }
6786 }
6787
6788 // The operands are legal.
6789 if (isRsrcLegal && isSoffsetLegal)
6790 return CreatedBB;
6791
6792 if (!isRsrcLegal) {
6793 // Legalize a VGPR Rsrc
6794 //
6795 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
6796 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
6797 // a zero-value SRsrc.
6798 //
6799 // If the instruction is _OFFSET (both idxen and offen disabled), and we
6800 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
6801 // above.
6802 //
6803 // Otherwise we are on non-ADDR64 hardware, and/or we have
6804 // idxen/offen/bothen and we fall back to a waterfall loop.
6805
6806 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6807 MachineBasicBlock &MBB = *MI.getParent();
6808
6809 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
6810 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
6811 // This is already an ADDR64 instruction so we need to add the pointer
6812 // extracted from the resource descriptor to the current value of VAddr.
6813 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6814 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6815 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6816
6817 const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6818 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
6819 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
6820
6821 unsigned RsrcPtr, NewSRsrc;
6822 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6823
6824 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
6825 const DebugLoc &DL = MI.getDebugLoc();
6826 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
6827 .addDef(CondReg0)
6828 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6829 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
6830 .addImm(0);
6831
6832 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
6833 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
6834 .addDef(CondReg1, RegState::Dead)
6835 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6836 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
6837 .addReg(CondReg0, RegState::Kill)
6838 .addImm(0);
6839
6840 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6841 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
6842 .addReg(NewVAddrLo)
6843 .addImm(AMDGPU::sub0)
6844 .addReg(NewVAddrHi)
6845 .addImm(AMDGPU::sub1);
6846
6847 VAddr->setReg(NewVAddr);
6848 Rsrc->setReg(NewSRsrc);
6849 } else if (!VAddr && ST.hasAddr64()) {
6850 // This instruction is the _OFFSET variant, so we need to convert it to
6851 // ADDR64.
6853 "FIXME: Need to emit flat atomics here");
6854
6855 unsigned RsrcPtr, NewSRsrc;
6856 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6857
6858 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6859 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
6860 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
6861 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6862 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
6863
6864 // Atomics with return have an additional tied operand and are
6865 // missing some of the special bits.
6866 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
6867 MachineInstr *Addr64;
6868
6869 if (!VDataIn) {
6870 // Regular buffer load / store.
6871 MachineInstrBuilder MIB =
6872 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6873 .add(*VData)
6874 .addReg(NewVAddr)
6875 .addReg(NewSRsrc)
6876 .add(*SOffset)
6877 .add(*Offset);
6878
6879 if (const MachineOperand *CPol =
6880 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
6881 MIB.addImm(CPol->getImm());
6882 }
6883
6884 if (const MachineOperand *TFE =
6885 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
6886 MIB.addImm(TFE->getImm());
6887 }
6888
6889 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
6890
6891 MIB.cloneMemRefs(MI);
6892 Addr64 = MIB;
6893 } else {
6894 // Atomics with return.
6895 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6896 .add(*VData)
6897 .add(*VDataIn)
6898 .addReg(NewVAddr)
6899 .addReg(NewSRsrc)
6900 .add(*SOffset)
6901 .add(*Offset)
6902 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
6903 .cloneMemRefs(MI);
6904 }
6905
6906 MI.removeFromParent();
6907
6908 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6909 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
6910 NewVAddr)
6911 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6912 .addImm(AMDGPU::sub0)
6913 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6914 .addImm(AMDGPU::sub1);
6915 } else {
6916 // Legalize a VGPR Rsrc and soffset together.
6917 if (!isSoffsetLegal) {
6918 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6919 CreatedBB =
6920 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
6921 return CreatedBB;
6922 }
6923 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
6924 return CreatedBB;
6925 }
6926 }
6927
6928 // Legalize a VGPR soffset.
6929 if (!isSoffsetLegal) {
6930 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6931 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
6932 return CreatedBB;
6933 }
6934 return CreatedBB;
6935}
6936
6937 void SIInstrWorklist::insert(MachineInstr *MI) {
6938 InstrList.insert(MI);
6939 // Add MBUF instructions to the deferred list.
6940 int RsrcIdx =
6941 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
6942 if (RsrcIdx != -1) {
6943 DeferredList.insert(MI);
6944 }
6945}
6946
6947 bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
6948 return DeferredList.contains(MI);
6949}
6950
6951 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
6952 MachineDominatorTree *MDT) const {
6953
6954 while (!Worklist.empty()) {
6955 MachineInstr &Inst = *Worklist.top();
6956 Worklist.erase_top();
6957 // Skip MachineInstr in the deferred list.
6958 if (Worklist.isDeferred(&Inst))
6959 continue;
6960 moveToVALUImpl(Worklist, MDT, Inst);
6961 }
6962
6963 // The deferred list of instructions is processed once all the
6964 // MachineInstrs in the worklist are done.
6965 for (MachineInstr *Inst : Worklist.getDeferredList()) {
6966 moveToVALUImpl(Worklist, MDT, *Inst);
6967 assert(Worklist.empty() &&
6968 "Deferred MachineInstr are not supposed to re-populate worklist");
6969 }
6970}
6971
6972 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
6973 MachineDominatorTree *MDT,
6974 MachineInstr &Inst) const {
6975
6976 MachineBasicBlock *MBB = Inst.getParent();
6977 if (!MBB)
6978 return;
6979 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6980 unsigned Opcode = Inst.getOpcode();
6981 unsigned NewOpcode = getVALUOp(Inst);
6982 // Handle some special cases
6983 switch (Opcode) {
6984 default:
6985 break;
6986 case AMDGPU::S_ADD_U64_PSEUDO:
6987 NewOpcode = AMDGPU::V_ADD_U64_PSEUDO;
6988 break;
6989 case AMDGPU::S_SUB_U64_PSEUDO:
6990 NewOpcode = AMDGPU::V_SUB_U64_PSEUDO;
6991 break;
6992 case AMDGPU::S_ADD_I32:
6993 case AMDGPU::S_SUB_I32: {
6994 // FIXME: The u32 versions currently selected use the carry.
6995 bool Changed;
6996 MachineBasicBlock *CreatedBBTmp = nullptr;
6997 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
6998 if (Changed)
6999 return;
7000
7001 // Default handling
7002 break;
7003 }
7004
7005 case AMDGPU::S_MUL_U64:
7006 // Split s_mul_u64 into 32-bit vector multiplications.
7007 splitScalarSMulU64(Worklist, Inst, MDT);
7008 Inst.eraseFromParent();
7009 return;
7010
7011 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7012 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7013 // This is a special case of s_mul_u64 where all the operands are either
7014 // zero extended or sign extended.
7015 splitScalarSMulPseudo(Worklist, Inst, MDT);
7016 Inst.eraseFromParent();
7017 return;
7018
7019 case AMDGPU::S_AND_B64:
7020 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7021 Inst.eraseFromParent();
7022 return;
7023
7024 case AMDGPU::S_OR_B64:
7025 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7026 Inst.eraseFromParent();
7027 return;
7028
7029 case AMDGPU::S_XOR_B64:
7030 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7031 Inst.eraseFromParent();
7032 return;
7033
7034 case AMDGPU::S_NAND_B64:
7035 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7036 Inst.eraseFromParent();
7037 return;
7038
7039 case AMDGPU::S_NOR_B64:
7040 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7041 Inst.eraseFromParent();
7042 return;
7043
7044 case AMDGPU::S_XNOR_B64:
7045 if (ST.hasDLInsts())
7046 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7047 else
7048 splitScalar64BitXnor(Worklist, Inst, MDT);
7049 Inst.eraseFromParent();
7050 return;
7051
7052 case AMDGPU::S_ANDN2_B64:
7053 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7054 Inst.eraseFromParent();
7055 return;
7056
7057 case AMDGPU::S_ORN2_B64:
7058 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7059 Inst.eraseFromParent();
7060 return;
7061
7062 case AMDGPU::S_BREV_B64:
7063 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7064 Inst.eraseFromParent();
7065 return;
7066
7067 case AMDGPU::S_NOT_B64:
7068 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7069 Inst.eraseFromParent();
7070 return;
7071
7072 case AMDGPU::S_BCNT1_I32_B64:
7073 splitScalar64BitBCNT(Worklist, Inst);
7074 Inst.eraseFromParent();
7075 return;
7076
7077 case AMDGPU::S_BFE_I64:
7078 splitScalar64BitBFE(Worklist, Inst);
7079 Inst.eraseFromParent();
7080 return;
7081
7082 case AMDGPU::S_FLBIT_I32_B64:
7083 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7084 Inst.eraseFromParent();
7085 return;
7086 case AMDGPU::S_FF1_I32_B64:
7087 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7088 Inst.eraseFromParent();
7089 return;
7090
7091 case AMDGPU::S_LSHL_B32:
7092 if (ST.hasOnlyRevVALUShifts()) {
7093 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7094 swapOperands(Inst);
7095 }
7096 break;
7097 case AMDGPU::S_ASHR_I32:
7098 if (ST.hasOnlyRevVALUShifts()) {
7099 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7100 swapOperands(Inst);
7101 }
7102 break;
7103 case AMDGPU::S_LSHR_B32:
7104 if (ST.hasOnlyRevVALUShifts()) {
7105 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7106 swapOperands(Inst);
7107 }
7108 break;
7109 case AMDGPU::S_LSHL_B64:
7110 if (ST.hasOnlyRevVALUShifts()) {
7111 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7112 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7113 : AMDGPU::V_LSHLREV_B64_e64;
7114 swapOperands(Inst);
7115 }
7116 break;
7117 case AMDGPU::S_ASHR_I64:
7118 if (ST.hasOnlyRevVALUShifts()) {
7119 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7120 swapOperands(Inst);
7121 }
7122 break;
7123 case AMDGPU::S_LSHR_B64:
7124 if (ST.hasOnlyRevVALUShifts()) {
7125 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7126 swapOperands(Inst);
7127 }
7128 break;
7129
7130 case AMDGPU::S_ABS_I32:
7131 lowerScalarAbs(Worklist, Inst);
7132 Inst.eraseFromParent();
7133 return;
7134
7135 case AMDGPU::S_CBRANCH_SCC0:
7136 case AMDGPU::S_CBRANCH_SCC1: {
7137 // Clear unused bits of vcc
7138 Register CondReg = Inst.getOperand(1).getReg();
7139 bool IsSCC = CondReg == AMDGPU::SCC;
7140 Register VCC = RI.getVCC();
7141 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7142 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7143 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7144 .addReg(EXEC)
7145 .addReg(IsSCC ? VCC : CondReg);
7146 Inst.removeOperand(1);
7147 } break;
7148
7149 case AMDGPU::S_BFE_U64:
7150 case AMDGPU::S_BFM_B64:
7151 llvm_unreachable("Moving this op to VALU not implemented");
7152
7153 case AMDGPU::S_PACK_LL_B32_B16:
7154 case AMDGPU::S_PACK_LH_B32_B16:
7155 case AMDGPU::S_PACK_HL_B32_B16:
7156 case AMDGPU::S_PACK_HH_B32_B16:
7157 movePackToVALU(Worklist, MRI, Inst);
7158 Inst.eraseFromParent();
7159 return;
7160
7161 case AMDGPU::S_XNOR_B32:
7162 lowerScalarXnor(Worklist, Inst);
7163 Inst.eraseFromParent();
7164 return;
7165
7166 case AMDGPU::S_NAND_B32:
7167 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7168 Inst.eraseFromParent();
7169 return;
7170
7171 case AMDGPU::S_NOR_B32:
7172 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7173 Inst.eraseFromParent();
7174 return;
7175
7176 case AMDGPU::S_ANDN2_B32:
7177 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7178 Inst.eraseFromParent();
7179 return;
7180
7181 case AMDGPU::S_ORN2_B32:
7182 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7183 Inst.eraseFromParent();
7184 return;
7185
7186 // TODO: remove as soon as everything is ready
7187 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7188 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7189 // can only be selected from the uniform SDNode.
7190 case AMDGPU::S_ADD_CO_PSEUDO:
7191 case AMDGPU::S_SUB_CO_PSEUDO: {
7192 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7193 ? AMDGPU::V_ADDC_U32_e64
7194 : AMDGPU::V_SUBB_U32_e64;
7195 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7196
7197 Register CarryInReg = Inst.getOperand(4).getReg();
7198 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7199 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7200 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7201 .addReg(CarryInReg);
7202 }
7203
7204 Register CarryOutReg = Inst.getOperand(1).getReg();
7205
7206 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7207 MRI.getRegClass(Inst.getOperand(0).getReg())));
7208 MachineInstr *CarryOp =
7209 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7210 .addReg(CarryOutReg, RegState::Define)
7211 .add(Inst.getOperand(2))
7212 .add(Inst.getOperand(3))
7213 .addReg(CarryInReg)
7214 .addImm(0);
7215 legalizeOperands(*CarryOp);
7216 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7217 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7218 Inst.eraseFromParent();
7219 }
7220 return;
7221 case AMDGPU::S_UADDO_PSEUDO:
7222 case AMDGPU::S_USUBO_PSEUDO: {
7223 const DebugLoc &DL = Inst.getDebugLoc();
7224 MachineOperand &Dest0 = Inst.getOperand(0);
7225 MachineOperand &Dest1 = Inst.getOperand(1);
7226 MachineOperand &Src0 = Inst.getOperand(2);
7227 MachineOperand &Src1 = Inst.getOperand(3);
7228
7229 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7230 ? AMDGPU::V_ADD_CO_U32_e64
7231 : AMDGPU::V_SUB_CO_U32_e64;
7232 const TargetRegisterClass *NewRC =
7233 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7234 Register DestReg = MRI.createVirtualRegister(NewRC);
7235 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7236 .addReg(Dest1.getReg(), RegState::Define)
7237 .add(Src0)
7238 .add(Src1)
7239 .addImm(0); // clamp bit
7240
7241 legalizeOperands(*NewInstr, MDT);
7242 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7243 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7244 Worklist);
7245 Inst.eraseFromParent();
7246 }
7247 return;
7248
7249 case AMDGPU::S_CSELECT_B32:
7250 case AMDGPU::S_CSELECT_B64:
7251 lowerSelect(Worklist, Inst, MDT);
7252 Inst.eraseFromParent();
7253 return;
7254 case AMDGPU::S_CMP_EQ_I32:
7255 case AMDGPU::S_CMP_LG_I32:
7256 case AMDGPU::S_CMP_GT_I32:
7257 case AMDGPU::S_CMP_GE_I32:
7258 case AMDGPU::S_CMP_LT_I32:
7259 case AMDGPU::S_CMP_LE_I32:
7260 case AMDGPU::S_CMP_EQ_U32:
7261 case AMDGPU::S_CMP_LG_U32:
7262 case AMDGPU::S_CMP_GT_U32:
7263 case AMDGPU::S_CMP_GE_U32:
7264 case AMDGPU::S_CMP_LT_U32:
7265 case AMDGPU::S_CMP_LE_U32:
7266 case AMDGPU::S_CMP_EQ_U64:
7267 case AMDGPU::S_CMP_LG_U64:
7268 case AMDGPU::S_CMP_LT_F32:
7269 case AMDGPU::S_CMP_EQ_F32:
7270 case AMDGPU::S_CMP_LE_F32:
7271 case AMDGPU::S_CMP_GT_F32:
7272 case AMDGPU::S_CMP_LG_F32:
7273 case AMDGPU::S_CMP_GE_F32:
7274 case AMDGPU::S_CMP_O_F32:
7275 case AMDGPU::S_CMP_U_F32:
7276 case AMDGPU::S_CMP_NGE_F32:
7277 case AMDGPU::S_CMP_NLG_F32:
7278 case AMDGPU::S_CMP_NGT_F32:
7279 case AMDGPU::S_CMP_NLE_F32:
7280 case AMDGPU::S_CMP_NEQ_F32:
7281 case AMDGPU::S_CMP_NLT_F32:
7282 case AMDGPU::S_CMP_LT_F16:
7283 case AMDGPU::S_CMP_EQ_F16:
7284 case AMDGPU::S_CMP_LE_F16:
7285 case AMDGPU::S_CMP_GT_F16:
7286 case AMDGPU::S_CMP_LG_F16:
7287 case AMDGPU::S_CMP_GE_F16:
7288 case AMDGPU::S_CMP_O_F16:
7289 case AMDGPU::S_CMP_U_F16:
7290 case AMDGPU::S_CMP_NGE_F16:
7291 case AMDGPU::S_CMP_NLG_F16:
7292 case AMDGPU::S_CMP_NGT_F16:
7293 case AMDGPU::S_CMP_NLE_F16:
7294 case AMDGPU::S_CMP_NEQ_F16:
7295 case AMDGPU::S_CMP_NLT_F16: {
7296 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7297 auto NewInstr =
7298 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7299 .setMIFlags(Inst.getFlags());
7300 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7301 AMDGPU::OpName::src0_modifiers) >= 0) {
7302 NewInstr
7303 .addImm(0) // src0_modifiers
7304 .add(Inst.getOperand(0)) // src0
7305 .addImm(0) // src1_modifiers
7306 .add(Inst.getOperand(1)) // src1
7307 .addImm(0); // clamp
7308 } else {
7309 NewInstr
7310 .add(Inst.getOperand(0))
7311 .add(Inst.getOperand(1));
7312 }
7313 legalizeOperands(*NewInstr, MDT);
7314 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7315 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7316 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7317 Inst.eraseFromParent();
7318 return;
7319 }
7320 case AMDGPU::S_CVT_HI_F32_F16: {
7321 const DebugLoc &DL = Inst.getDebugLoc();
7322 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7323 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7324 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7325 .addImm(16)
7326 .add(Inst.getOperand(1));
7327 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7328 .addImm(0) // src0_modifiers
7329 .addReg(TmpReg)
7330 .addImm(0) // clamp
7331 .addImm(0); // omod
7332
7333 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7334 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7335 Inst.eraseFromParent();
7336 return;
7337 }
7338 case AMDGPU::S_MINIMUM_F32:
7339 case AMDGPU::S_MAXIMUM_F32:
7340 case AMDGPU::S_MINIMUM_F16:
7341 case AMDGPU::S_MAXIMUM_F16: {
7342 const DebugLoc &DL = Inst.getDebugLoc();
7343 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7344 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7345 .addImm(0) // src0_modifiers
7346 .add(Inst.getOperand(1))
7347 .addImm(0) // src1_modifiers
7348 .add(Inst.getOperand(2))
7349 .addImm(0) // clamp
7350 .addImm(0); // omod
7351 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7352
7353 legalizeOperands(*NewInstr, MDT);
7354 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7355 Inst.eraseFromParent();
7356 return;
7357 }
7358 }
7359
7360 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
7361 // We cannot move this instruction to the VALU, so we should try to
7362 // legalize its operands instead.
7363 legalizeOperands(Inst, MDT);
7364 return;
7365 }
7366 // Handle converting generic instructions like COPY-to-SGPR into
7367 // COPY-to-VGPR.
7368 if (NewOpcode == Opcode) {
7369 Register DstReg = Inst.getOperand(0).getReg();
7370 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7371
7372 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7373 // hope for the best.
7374 if (Inst.isCopy() && DstReg.isPhysical() &&
7375 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7376 // TODO: Only works for 32 bit registers.
7377 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7378 get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7379 .add(Inst.getOperand(1));
7380 Inst.eraseFromParent();
7381 return;
7382 }
7383
7384 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
7385 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
7386 // Instead of creating a copy where src and dst are the same register
7387 // class, we just replace all uses of dst with src. These kinds of
7388 // copies interfere with the heuristics MachineSink uses to decide
7389 // whether or not to split a critical edge, since the pass assumes
7390 // that copies will end up as machine instructions and not be
7391 // eliminated.
7392 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
7393 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
7394 MRI.clearKillFlags(Inst.getOperand(1).getReg());
7395 Inst.getOperand(0).setReg(DstReg);
7396 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7397 // these are deleted later, but at -O0 it would leave a suspicious
7398 // looking illegal copy of an undef register.
7399 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7400 Inst.removeOperand(I);
7401 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
7402 return;
7403 }
7404 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7405 MRI.replaceRegWith(DstReg, NewDstReg);
7406 legalizeOperands(Inst, MDT);
7407 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7408 return;
7409 }
7410
7411 // Use the new VALU Opcode.
7412 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
7413 .setMIFlags(Inst.getFlags());
7414 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
7415 // Intersperse VOP3 modifiers among the SALU operands.
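// Schematic layout (illustration): a two-source SALU op (dst, src0, src1)
// becomes roughly
//   (dst, src0_modifiers=0, src0, src1_modifiers=0, src1, clamp=0, omod=0)
// with each modifier/clamp/omod operand added only if the VOP3 form has it.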
7416 NewInstr->addOperand(Inst.getOperand(0));
7417 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7418 AMDGPU::OpName::src0_modifiers) >= 0)
7419 NewInstr.addImm(0);
7420 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7421 MachineOperand Src = Inst.getOperand(1);
7422 if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7423 Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7424 NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7425 else
7426 NewInstr->addOperand(Src);
7427 }
7428
7429 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
7430 // We are converting these to a BFE, so we need to add the missing
7431 // operands for the size and offset.
7432 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
7433 NewInstr.addImm(0);
7434 NewInstr.addImm(Size);
7435 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
7436 // The VALU version adds the second operand to the result, so insert an
7437 // extra 0 operand.
7438 NewInstr.addImm(0);
7439 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
7440 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
7441 // If we need to move this to VGPRs, we need to unpack the second
7442 // operand back into the 2 separate ones for bit offset and width.
7443 assert(OffsetWidthOp.isImm() &&
7444 "Scalar BFE is only implemented for constant width and offset");
7445 uint32_t Imm = OffsetWidthOp.getImm();
7446
7447 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
7448 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
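// For example (illustration only): Imm = 0x00100008 decodes to Offset = 8
// and BitWidth = 16, i.e. the BFE extracts bits [23:8].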
7449 NewInstr.addImm(Offset);
7450 NewInstr.addImm(BitWidth);
7451 } else {
7452 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7453 AMDGPU::OpName::src1_modifiers) >= 0)
7454 NewInstr.addImm(0);
7455 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
7456 NewInstr->addOperand(Inst.getOperand(2));
7457 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7458 AMDGPU::OpName::src2_modifiers) >= 0)
7459 NewInstr.addImm(0);
7460 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
7461 NewInstr->addOperand(Inst.getOperand(3));
7462 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
7463 NewInstr.addImm(0);
7464 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
7465 NewInstr.addImm(0);
7466 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
7467 NewInstr.addImm(0);
7468 }
7469 } else {
7470 // Just copy the SALU operands.
7471 for (const MachineOperand &Op : Inst.explicit_operands())
7472 NewInstr->addOperand(Op);
7473 }
7474
7475 // Remove any references to SCC. Vector instructions can't read from it, and
7476 // we're just about to add the implicit uses / defs of VCC, and we don't want
7477 // both.
7478 for (MachineOperand &Op : Inst.implicit_operands()) {
7479 if (Op.getReg() == AMDGPU::SCC) {
7480 // Only propagate through live-def of SCC.
7481 if (Op.isDef() && !Op.isDead())
7482 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
7483 if (Op.isUse())
7484 addSCCDefsToVALUWorklist(NewInstr, Worklist);
7485 }
7486 }
7487 Inst.eraseFromParent();
7488 Register NewDstReg;
7489 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
7490 Register DstReg = NewInstr->getOperand(0).getReg();
7491 assert(DstReg.isVirtual());
7492 // Update the destination register class.
7493 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
7494 assert(NewDstRC);
7495 NewDstReg = MRI.createVirtualRegister(NewDstRC);
7496 MRI.replaceRegWith(DstReg, NewDstReg);
7497 }
7498 fixImplicitOperands(*NewInstr);
7499 // Legalize the operands
7500 legalizeOperands(*NewInstr, MDT);
7501 if (NewDstReg)
7502 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7503}
7504
7505// Add/sub require special handling to deal with carry outs.
7506std::pair<bool, MachineBasicBlock *>
7507SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
7508 MachineDominatorTree *MDT) const {
7509 if (ST.hasAddNoCarry()) {
7510 // Assume there is no user of scc since we don't select this in that case.
7511 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
7512 // is used.
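// Sketch (illustration): on subtargets with add-no-carry this turns
//   S_ADD_I32 %d, %a, %b  into  V_ADD_U32_e64 %d, %a, %b, 0 /*clamp*/
// with the dead SCC def dropped.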
7513
7514 MachineBasicBlock &MBB = *Inst.getParent();
7515 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7516
7517 Register OldDstReg = Inst.getOperand(0).getReg();
7518 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7519
7520 unsigned Opc = Inst.getOpcode();
7521 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
7522
7523 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
7524 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
7525
7526 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
7527 Inst.removeOperand(3);
7528
7529 Inst.setDesc(get(NewOpc));
7530 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
7532 MRI.replaceRegWith(OldDstReg, ResultReg);
7533 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
7534
7535 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7536 return std::pair(true, NewBB);
7537 }
7538
7539 return std::pair(false, nullptr);
7540}
7541
7542void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
7543 MachineDominatorTree *MDT) const {
7544
7545 MachineBasicBlock &MBB = *Inst.getParent();
7546 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7547 MachineBasicBlock::iterator MII = Inst;
7548 DebugLoc DL = Inst.getDebugLoc();
7549
7550 MachineOperand &Dest = Inst.getOperand(0);
7551 MachineOperand &Src0 = Inst.getOperand(1);
7552 MachineOperand &Src1 = Inst.getOperand(2);
7553 MachineOperand &Cond = Inst.getOperand(3);
7554
7555 Register CondReg = Cond.getReg();
7556 bool IsSCC = (CondReg == AMDGPU::SCC);
7557
7558 // If this is a trivial select where the condition is effectively not SCC
7559 // (CondReg is a source of copy to SCC), then the select is semantically
7560 // equivalent to copying CondReg. Hence, there is no need to create
7561 // V_CNDMASK, we can just use that and bail out.
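// E.g. (sketch): %d = S_CSELECT_B64 -1, 0 with a non-SCC condition %c is
// just a copy of the lane mask, so all uses of %d are replaced with %c.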
7562 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
7563 (Src1.getImm() == 0)) {
7564 MRI.replaceRegWith(Dest.getReg(), CondReg);
7565 return;
7566 }
7567
7568 Register NewCondReg = CondReg;
7569 if (IsSCC) {
7570 const TargetRegisterClass *TC =
7571 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7572 NewCondReg = MRI.createVirtualRegister(TC);
7573
7574 // Now look backwards for the closest SCC def; if it is a copy,
7575 // replace CondReg with the COPY's source register.
7576 bool CopyFound = false;
7577 for (MachineInstr &CandI :
7578 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
7579 Inst.getParent()->rend())) {
7580 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
7581 -1) {
7582 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
7583 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
7584 .addReg(CandI.getOperand(1).getReg());
7585 CopyFound = true;
7586 }
7587 break;
7588 }
7589 }
7590 if (!CopyFound) {
7591 // SCC def is not a copy
7592 // Insert a trivial select instead of creating a copy, because a copy from
7593 // SCC would semantically mean just copying a single bit, but we may need
7594 // the result to be a vector condition mask that needs preserving.
7595 unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
7596 : AMDGPU::S_CSELECT_B32;
7597 auto NewSelect =
7598 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
7599 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
7600 }
7601 }
7602
7603 Register NewDestReg = MRI.createVirtualRegister(
7604 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
7605 MachineInstr *NewInst;
7606 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
7607 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
7608 .addImm(0)
7609 .add(Src1) // False
7610 .addImm(0)
7611 .add(Src0) // True
7612 .addReg(NewCondReg);
7613 } else {
7614 NewInst =
7615 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
7616 .add(Src1) // False
7617 .add(Src0) // True
7618 .addReg(NewCondReg);
7619 }
7620 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
7621 legalizeOperands(*NewInst, MDT);
7622 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
7623}
7624
7625void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
7626 MachineInstr &Inst) const {
7627 MachineBasicBlock &MBB = *Inst.getParent();
7628 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7629 MachineBasicBlock::iterator MII = Inst;
7630 DebugLoc DL = Inst.getDebugLoc();
7631
7632 MachineOperand &Dest = Inst.getOperand(0);
7633 MachineOperand &Src = Inst.getOperand(1);
7634 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7635 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7636
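// Sketch of the expansion below (illustration): abs(x) is computed as
//   %tmp = 0 - x
//   %res = max_i32(x, %tmp)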
7637 unsigned SubOp = ST.hasAddNoCarry() ?
7638 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
7639
7640 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
7641 .addImm(0)
7642 .addReg(Src.getReg());
7643
7644 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
7645 .addReg(Src.getReg())
7646 .addReg(TmpReg);
7647
7648 MRI.replaceRegWith(Dest.getReg(), ResultReg);
7649 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7650}
7651
7652void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
7653 MachineInstr &Inst) const {
7654 MachineBasicBlock &MBB = *Inst.getParent();
7655 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7656 MachineBasicBlock::iterator MII = Inst;
7657 const DebugLoc &DL = Inst.getDebugLoc();
7658
7659 MachineOperand &Dest = Inst.getOperand(0);
7660 MachineOperand &Src0 = Inst.getOperand(1);
7661 MachineOperand &Src1 = Inst.getOperand(2);
7662
7663 if (ST.hasDLInsts()) {
7664 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7665 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
7666 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
7667
7668 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
7669 .add(Src0)
7670 .add(Src1);
7671
7672 MRI.replaceRegWith(Dest.getReg(), NewDest);
7673 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7674 } else {
7675 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
7676 // invert either source and then perform the XOR. If either source is a
7677 // scalar register, then we can leave the inversion on the scalar unit to
7678 // achieve a better distribution of scalar and vector instructions.
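// E.g. (sketch): s_xnor_b32 %d, %sgpr, %vgpr can be rewritten as
//   s_not_b32 %t, %sgpr
//   s_xor_b32 %d, %t, %vgpr
// and the xor is moved to the VALU on a later worklist iteration if needed.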
7679 bool Src0IsSGPR = Src0.isReg() &&
7680 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
7681 bool Src1IsSGPR = Src1.isReg() &&
7682 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
7683 MachineInstr *Xor;
7684 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7685 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7686
7687 // Build a pair of scalar instructions and add them to the work list.
7688 // The next iteration over the work list will lower these to the vector
7689 // unit as necessary.
7690 if (Src0IsSGPR) {
7691 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
7692 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7693 .addReg(Temp)
7694 .add(Src1);
7695 } else if (Src1IsSGPR) {
7696 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
7697 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7698 .add(Src0)
7699 .addReg(Temp);
7700 } else {
7701 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
7702 .add(Src0)
7703 .add(Src1);
7704 MachineInstr *Not =
7705 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
7706 Worklist.insert(Not);
7707 }
7708
7709 MRI.replaceRegWith(Dest.getReg(), NewDest);
7710
7711 Worklist.insert(Xor);
7712
7713 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7714 }
7715}
7716
7717void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
7718 MachineInstr &Inst,
7719 unsigned Opcode) const {
7720 MachineBasicBlock &MBB = *Inst.getParent();
7721 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7722 MachineBasicBlock::iterator MII = Inst;
7723 const DebugLoc &DL = Inst.getDebugLoc();
7724
7725 MachineOperand &Dest = Inst.getOperand(0);
7726 MachineOperand &Src0 = Inst.getOperand(1);
7727 MachineOperand &Src1 = Inst.getOperand(2);
7728
7729 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7730 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7731
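// E.g. (sketch): S_NAND_B32 %d, %a, %b is rewritten as
//   <Opcode> %interm, %a, %b      ; S_AND_B32 for the NAND case
//   S_NOT_B32 %d, %interm
// and both new instructions are queued for later VALU conversion.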
7732 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
7733 .add(Src0)
7734 .add(Src1);
7735
7736 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
7737 .addReg(Interm);
7738
7739 Worklist.insert(&Op);
7740 Worklist.insert(&Not);
7741
7742 MRI.replaceRegWith(Dest.getReg(), NewDest);
7743 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7744}
7745
7746void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
7747 MachineInstr &Inst,
7748 unsigned Opcode) const {
7749 MachineBasicBlock &MBB = *Inst.getParent();
7750 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7751 MachineBasicBlock::iterator MII = Inst;
7752 const DebugLoc &DL = Inst.getDebugLoc();
7753
7754 MachineOperand &Dest = Inst.getOperand(0);
7755 MachineOperand &Src0 = Inst.getOperand(1);
7756 MachineOperand &Src1 = Inst.getOperand(2);
7757
7758 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7759 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7760
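// E.g. (sketch): S_ANDN2_B32 %d, %a, %b (a & ~b) is rewritten as
//   S_NOT_B32 %interm, %b
//   <Opcode> %d, %a, %interm      ; S_AND_B32 for the ANDN2 case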
7761 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
7762 .add(Src1);
7763
7764 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
7765 .add(Src0)
7766 .addReg(Interm);
7767
7768 Worklist.insert(&Not);
7769 Worklist.insert(&Op);
7770
7771 MRI.replaceRegWith(Dest.getReg(), NewDest);
7772 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7773}
7774
7775void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
7776 MachineInstr &Inst, unsigned Opcode,
7777 bool Swap) const {
7778 MachineBasicBlock &MBB = *Inst.getParent();
7779 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7780
7781 MachineOperand &Dest = Inst.getOperand(0);
7782 MachineOperand &Src0 = Inst.getOperand(1);
7783 DebugLoc DL = Inst.getDebugLoc();
7784
7785 MachineBasicBlock::iterator MII = Inst;
7786
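// Sketch (illustration): a 64-bit unary SALU op such as S_NOT_B64 %d, %s is
// split into per-half 32-bit ops recombined with a REG_SEQUENCE:
//   %d.sub0 = S_NOT_B32 %s.sub0
//   %d.sub1 = S_NOT_B32 %s.sub1
// (the halves are swapped when Swap is set, e.g. for S_BREV_B64).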
7787 const MCInstrDesc &InstDesc = get(Opcode);
7788 const TargetRegisterClass *Src0RC = Src0.isReg() ?
7789 MRI.getRegClass(Src0.getReg()) :
7790 &AMDGPU::SGPR_32RegClass;
7791
7792 const TargetRegisterClass *Src0SubRC =
7793 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7794
7795 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7796 AMDGPU::sub0, Src0SubRC);
7797
7798 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
7799 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
7800 const TargetRegisterClass *NewDestSubRC =
7801 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
7802
7803 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
7804 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
7805
7806 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7807 AMDGPU::sub1, Src0SubRC);
7808
7809 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
7810 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
7811
7812 if (Swap)
7813 std::swap(DestSub0, DestSub1);
7814
7815 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
7816 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7817 .addReg(DestSub0)
7818 .addImm(AMDGPU::sub0)
7819 .addReg(DestSub1)
7820 .addImm(AMDGPU::sub1);
7821
7822 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7823
7824 Worklist.insert(&LoHalf);
7825 Worklist.insert(&HiHalf);
7826
7827 // We don't need to legalizeOperands here because for a single operand, src0
7828 // will support any kind of input.
7829
7830 // Move all users of this moved value.
7831 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7832}
7833
7834 // There is no vector equivalent of s_mul_u64. For this reason, we need to
7835 // split the s_mul_u64 into 32-bit vector multiplications.
7836void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
7837 MachineInstr &Inst,
7838 MachineDominatorTree *MDT) const {
7839 MachineBasicBlock &MBB = *Inst.getParent();
7840 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7841
7842 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7843 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7844 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7845
7846 MachineOperand &Dest = Inst.getOperand(0);
7847 MachineOperand &Src0 = Inst.getOperand(1);
7848 MachineOperand &Src1 = Inst.getOperand(2);
7849 const DebugLoc &DL = Inst.getDebugLoc();
7850 MachineBasicBlock::iterator MII = Inst;
7851
7852 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7853 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7854 const TargetRegisterClass *Src0SubRC =
7855 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7856 if (RI.isSGPRClass(Src0SubRC))
7857 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7858 const TargetRegisterClass *Src1SubRC =
7859 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7860 if (RI.isSGPRClass(Src1SubRC))
7861 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7862
7863 // First, we extract the low 32-bit and high 32-bit values from each of the
7864 // operands.
7865 MachineOperand Op0L =
7866 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7867 MachineOperand Op1L =
7868 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7869 MachineOperand Op0H =
7870 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
7871 MachineOperand Op1H =
7872 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
7873
7874 // The multiplication is done as follows:
7875 //
7876 // Op1H Op1L
7877 // * Op0H Op0L
7878 // --------------------
7879 // Op1H*Op0L Op1L*Op0L
7880 // + Op1H*Op0H Op1L*Op0H
7881 // -----------------------------------------
7882 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
7883 //
7884 // We drop Op1H*Op0H because the result is truncated to 64 bits and that
7885 // term only contributes to bits above bit 63.
7886 // The low 32-bit value is Op1L*Op0L.
7887 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
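// Worked example with 4-bit "words" instead of 32-bit ones (illustration):
// 0x23 * 0x15, i.e. Op0H=2, Op0L=3, Op1H=1, Op1L=5:
//   low  = Op1L*Op0L                     = 5*3           = 0xf (carry = 0)
//   high = Op1H*Op0L + Op1L*Op0H + carry = 1*3 + 5*2 + 0 = 0xd
// giving 0xdf, which is 0x2df (= 35*21) truncated to two words.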
7888
7889 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7890 MachineInstr *Op1L_Op0H =
7891 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
7892 .add(Op1L)
7893 .add(Op0H);
7894
7895 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7896 MachineInstr *Op1H_Op0L =
7897 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
7898 .add(Op1H)
7899 .add(Op0L);
7900
7901 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7902 MachineInstr *Carry =
7903 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
7904 .add(Op1L)
7905 .add(Op0L);
7906
7907 MachineInstr *LoHalf =
7908 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7909 .add(Op1L)
7910 .add(Op0L);
7911
7912 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7913 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
7914 .addReg(Op1L_Op0H_Reg)
7915 .addReg(Op1H_Op0L_Reg);
7916
7917 MachineInstr *HiHalf =
7918 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
7919 .addReg(AddReg)
7920 .addReg(CarryReg);
7921
7922 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7923 .addReg(DestSub0)
7924 .addImm(AMDGPU::sub0)
7925 .addReg(DestSub1)
7926 .addImm(AMDGPU::sub1);
7927
7928 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7929
7930 // Try to legalize the operands in case we need to swap the order to keep it
7931 // valid.
7932 legalizeOperands(*Op1L_Op0H, MDT);
7933 legalizeOperands(*Op1H_Op0L, MDT);
7934 legalizeOperands(*Carry, MDT);
7935 legalizeOperands(*LoHalf, MDT);
7936 legalizeOperands(*Add, MDT);
7937 legalizeOperands(*HiHalf, MDT);
7938
7939 // Move all users of this moved value.
7940 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7941}
7942
7943 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
7944 // multiplications.
7945void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
7946 MachineInstr &Inst,
7947 MachineDominatorTree *MDT) const {
7948 MachineBasicBlock &MBB = *Inst.getParent();
7949 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7950
7951 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7952 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7953 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7954
7955 MachineOperand &Dest = Inst.getOperand(0);
7956 MachineOperand &Src0 = Inst.getOperand(1);
7957 MachineOperand &Src1 = Inst.getOperand(2);
7958 const DebugLoc &DL = Inst.getDebugLoc();
7959 MachineBasicBlock::iterator MII = Inst;
7960
7961 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7962 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7963 const TargetRegisterClass *Src0SubRC =
7964 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7965 if (RI.isSGPRClass(Src0SubRC))
7966 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7967 const TargetRegisterClass *Src1SubRC =
7968 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7969 if (RI.isSGPRClass(Src1SubRC))
7970 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7971
7972 // First, we extract the low 32-bit and high 32-bit values from each of the
7973 // operands.
7974 MachineOperand Op0L =
7975 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7976 MachineOperand Op1L =
7977 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7978
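// Sketch (illustration): because both operands are zero-/sign-extended
// 32-bit values, the full 64-bit product only needs the low words:
//   %d.sub0 = V_MUL_LO_U32(a.lo, b.lo)
//   %d.sub1 = V_MUL_HI_U32(a.lo, b.lo)   ; V_MUL_HI_I32 for the signed pseudo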
7979 unsigned Opc = Inst.getOpcode();
7980 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
7981 ? AMDGPU::V_MUL_HI_U32_e64
7982 : AMDGPU::V_MUL_HI_I32_e64;
7983 MachineInstr *HiHalf =
7984 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
7985
7986 MachineInstr *LoHalf =
7987 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7988 .add(Op1L)
7989 .add(Op0L);
7990
7991 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7992 .addReg(DestSub0)
7993 .addImm(AMDGPU::sub0)
7994 .addReg(DestSub1)
7995 .addImm(AMDGPU::sub1);
7996
7997 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7998
7999 // Try to legalize the operands in case we need to swap the order to keep it
8000 // valid.
8001 legalizeOperands(*HiHalf, MDT);
8002 legalizeOperands(*LoHalf, MDT);
8003
8004 // Move all users of this moved value.
8005 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8006}
8007
8008void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8009 MachineInstr &Inst, unsigned Opcode,
8010 MachineDominatorTree *MDT) const {
8011 MachineBasicBlock &MBB = *Inst.getParent();
8012 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8013
8014 MachineOperand &Dest = Inst.getOperand(0);
8015 MachineOperand &Src0 = Inst.getOperand(1);
8016 MachineOperand &Src1 = Inst.getOperand(2);
8017 DebugLoc DL = Inst.getDebugLoc();
8018
8019 MachineBasicBlock::iterator MII = Inst;
8020
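// Sketch (illustration): S_AND_B64 %d, %a, %b is split into
//   %d.sub0 = S_AND_B32 %a.sub0, %b.sub0
//   %d.sub1 = S_AND_B32 %a.sub1, %b.sub1
// with both halves queued so later worklist iterations can move them to the
// VALU if required.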
8021 const MCInstrDesc &InstDesc = get(Opcode);
8022 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8023 MRI.getRegClass(Src0.getReg()) :
8024 &AMDGPU::SGPR_32RegClass;
8025
8026 const TargetRegisterClass *Src0SubRC =
8027 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8028 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8029 MRI.getRegClass(Src1.getReg()) :
8030 &AMDGPU::SGPR_32RegClass;
8031
8032 const TargetRegisterClass *Src1SubRC =
8033 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8034
8035 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8036 AMDGPU::sub0, Src0SubRC);
8037 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8038 AMDGPU::sub0, Src1SubRC);
8039 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8040 AMDGPU::sub1, Src0SubRC);
8041 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8042 AMDGPU::sub1, Src1SubRC);
8043
8044 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8045 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8046 const TargetRegisterClass *NewDestSubRC =
8047 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8048
8049 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8050 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8051 .add(SrcReg0Sub0)
8052 .add(SrcReg1Sub0);
8053
8054 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8055 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8056 .add(SrcReg0Sub1)
8057 .add(SrcReg1Sub1);
8058
8059 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8060 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8061 .addReg(DestSub0)
8062 .addImm(AMDGPU::sub0)
8063 .addReg(DestSub1)
8064 .addImm(AMDGPU::sub1);
8065
8066 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8067
8068 Worklist.insert(&LoHalf);
8069 Worklist.insert(&HiHalf);
8070
8071 // Move all users of this moved value.
8072 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8073}
8074
8075void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8076 MachineInstr &Inst,
8077 MachineDominatorTree *MDT) const {
8078 MachineBasicBlock &MBB = *Inst.getParent();
8079 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8080
8081 MachineOperand &Dest = Inst.getOperand(0);
8082 MachineOperand &Src0 = Inst.getOperand(1);
8083 MachineOperand &Src1 = Inst.getOperand(2);
8084 const DebugLoc &DL = Inst.getDebugLoc();
8085
8086 MachineBasicBlock::iterator MII = Inst;
8087
8088 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8089
8090 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8091
8092 MachineOperand* Op0;
8093 MachineOperand* Op1;
8094
8095 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8096 Op0 = &Src0;
8097 Op1 = &Src1;
8098 } else {
8099 Op0 = &Src1;
8100 Op1 = &Src0;
8101 }
8102
8103 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8104 .add(*Op0);
8105
8106 Register NewDest = MRI.createVirtualRegister(DestRC);
8107
8108 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8109 .addReg(Interm)
8110 .add(*Op1);
8111
8112 MRI.replaceRegWith(Dest.getReg(), NewDest);
8113
8114 Worklist.insert(&Xor);
8115}
8116
8117void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8118 MachineInstr &Inst) const {
8119 MachineBasicBlock &MBB = *Inst.getParent();
8120 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8121
8122 MachineBasicBlock::iterator MII = Inst;
8123 const DebugLoc &DL = Inst.getDebugLoc();
8124
8125 MachineOperand &Dest = Inst.getOperand(0);
8126 MachineOperand &Src = Inst.getOperand(1);
8127
8128 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8129 const TargetRegisterClass *SrcRC = Src.isReg() ?
8130 MRI.getRegClass(Src.getReg()) :
8131 &AMDGPU::SGPR_32RegClass;
8132
8133 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8134 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8135
8136 const TargetRegisterClass *SrcSubRC =
8137 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8138
8139 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8140 AMDGPU::sub0, SrcSubRC);
8141 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8142 AMDGPU::sub1, SrcSubRC);
8143
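// Sketch (illustration): the 64-bit popcount is accumulated through the
// second (add) operand of V_BCNT_U32_B32:
//   %mid = V_BCNT_U32_B32(%src.sub0, 0)
//   %res = V_BCNT_U32_B32(%src.sub1, %mid)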
8144 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8145
8146 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8147
8148 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8149
8150 // We don't need to legalize operands here. src0 for either instruction can be
8151 // an SGPR, and the second input is unused or determined here.
8152 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8153}
8154
8155void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8156 MachineInstr &Inst) const {
8157 MachineBasicBlock &MBB = *Inst.getParent();
8158 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8159 MachineBasicBlock::iterator MII = Inst;
8160 const DebugLoc &DL = Inst.getDebugLoc();
8161
8162 MachineOperand &Dest = Inst.getOperand(0);
8163 uint32_t Imm = Inst.getOperand(2).getImm();
8164 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8165 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8166
8167 (void) Offset;
8168
8169 // Only sext_inreg cases handled.
8170 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8171 Offset == 0 && "Not implemented");
8172
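// Sketch (illustration): for BitWidth = 16 this is sext_inreg i64 from i16:
//   %lo = V_BFE_I32(%src.sub0, 0, 16)   ; sign-extend bits [15:0] to 32 bits
//   %hi = V_ASHRREV_I32(31, %lo)        ; broadcast the sign bit
//   %d  = REG_SEQUENCE %lo, sub0, %hi, sub1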
8173 if (BitWidth < 32) {
8174 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8175 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8176 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8177
8178 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8179 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8180 .addImm(0)
8181 .addImm(BitWidth);
8182
8183 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8184 .addImm(31)
8185 .addReg(MidRegLo);
8186
8187 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8188 .addReg(MidRegLo)
8189 .addImm(AMDGPU::sub0)
8190 .addReg(MidRegHi)
8191 .addImm(AMDGPU::sub1);
8192
8193 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8194 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8195 return;
8196 }
8197
8198 MachineOperand &Src = Inst.getOperand(1);
8199 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8200 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8201
8202 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8203 .addImm(31)
8204 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8205
8206 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8207 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8208 .addImm(AMDGPU::sub0)
8209 .addReg(TmpReg)
8210 .addImm(AMDGPU::sub1);
8211
8212 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8213 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8214}
8215
8216void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8217 MachineInstr &Inst, unsigned Opcode,
8218 MachineDominatorTree *MDT) const {
8219 // (S_FLBIT_I32_B64 hi:lo) ->
8220 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8221 // (S_FF1_I32_B64 hi:lo) ->
8222 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
8223
8224 MachineBasicBlock &MBB = *Inst.getParent();
8225 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8226 MachineBasicBlock::iterator MII = Inst;
8227 const DebugLoc &DL = Inst.getDebugLoc();
8228
8229 MachineOperand &Dest = Inst.getOperand(0);
8230 MachineOperand &Src = Inst.getOperand(1);
8231
8232 const MCInstrDesc &InstDesc = get(Opcode);
8233
8234 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8235 unsigned OpcodeAdd =
8236 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8237
8238 const TargetRegisterClass *SrcRC =
8239 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8240 const TargetRegisterClass *SrcSubRC =
8241 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8242
8243 MachineOperand SrcRegSub0 =
8244 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8245 MachineOperand SrcRegSub1 =
8246 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8247
8248 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8249 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8250 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8251 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8252
8253 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8254
8255 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8256
8257 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8258 .addReg(IsCtlz ? MidReg1 : MidReg2)
8259 .addImm(32)
8260 .addImm(1); // enable clamp
8261
8262 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8263 .addReg(MidReg3)
8264 .addReg(IsCtlz ? MidReg2 : MidReg1);
8265
8266 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8267
8268 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8269}
8270
8271 void SIInstrInfo::addUsersToMoveToVALUWorklist(
8272 Register DstReg, MachineRegisterInfo &MRI,
8273 SIInstrWorklist &Worklist) const {
8274 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
8275 E = MRI.use_end(); I != E;) {
8276 MachineInstr &UseMI = *I->getParent();
8277
8278 unsigned OpNo = 0;
8279
8280 switch (UseMI.getOpcode()) {
8281 case AMDGPU::COPY:
8282 case AMDGPU::WQM:
8283 case AMDGPU::SOFT_WQM:
8284 case AMDGPU::STRICT_WWM:
8285 case AMDGPU::STRICT_WQM:
8286 case AMDGPU::REG_SEQUENCE:
8287 case AMDGPU::PHI:
8288 case AMDGPU::INSERT_SUBREG:
8289 break;
8290 default:
8291 OpNo = I.getOperandNo();
8292 break;
8293 }
8294
8295 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
8296 Worklist.insert(&UseMI);
8297
8298 do {
8299 ++I;
8300 } while (I != E && I->getParent() == &UseMI);
8301 } else {
8302 ++I;
8303 }
8304 }
8305}
8306
8307 void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
8308 MachineRegisterInfo &MRI,
8309 MachineInstr &Inst) const {
8310 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8311 MachineBasicBlock *MBB = Inst.getParent();
8312 MachineOperand &Src0 = Inst.getOperand(1);
8313 MachineOperand &Src1 = Inst.getOperand(2);
8314 const DebugLoc &DL = Inst.getDebugLoc();
8315
8316 switch (Inst.getOpcode()) {
8317 case AMDGPU::S_PACK_LL_B32_B16: {
8318 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8319 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8320
8321 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8322 // 0.
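// Sketch (illustration): the pack computes
//   result = (src1[15:0] << 16) | src0[15:0]
// via V_AND_B32 to mask src0 and V_LSHL_OR_B32 to shift src1 and merge.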
8323 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8324 .addImm(0xffff);
8325
8326 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
8327 .addReg(ImmReg, RegState::Kill)
8328 .add(Src0);
8329
8330 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8331 .add(Src1)
8332 .addImm(16)
8333 .addReg(TmpReg, RegState::Kill);
8334 break;
8335 }
8336 case AMDGPU::S_PACK_LH_B32_B16: {
8337 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8338 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8339 .addImm(0xffff);
8340 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
8341 .addReg(ImmReg, RegState::Kill)
8342 .add(Src0)
8343 .add(Src1);
8344 break;
8345 }
8346 case AMDGPU::S_PACK_HL_B32_B16: {
8347 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8348 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8349 .addImm(16)
8350 .add(Src0);
8351 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8352 .add(Src1)
8353 .addImm(16)
8354 .addReg(TmpReg, RegState::Kill);
8355 break;
8356 }
8357 case AMDGPU::S_PACK_HH_B32_B16: {
8358 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8359 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8360 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8361 .addImm(16)
8362 .add(Src0);
8363 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8364 .addImm(0xffff0000);
8365 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
8366 .add(Src1)
8367 .addReg(ImmReg, RegState::Kill)
8368 .addReg(TmpReg, RegState::Kill);
8369 break;
8370 }
8371 default:
8372 llvm_unreachable("unhandled s_pack_* instruction");
8373 }
8374
8375 MachineOperand &Dest = Inst.getOperand(0);
8376 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8377 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8378}
8379
8380void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
8381 MachineInstr &SCCDefInst,
8382 SIInstrWorklist &Worklist,
8383 Register NewCond) const {
8384
8385 // Ensure that def inst defines SCC, which is still live.
8386 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
8387 !Op.isDead() && Op.getParent() == &SCCDefInst);
8388 SmallVector<MachineInstr *, 4> CopyToDelete;
8389 // This assumes that all the users of SCC are in the same block
8390 // as the SCC def.
8391 for (MachineInstr &MI : // Skip the def inst itself.
8392 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
8393 SCCDefInst.getParent()->end())) {
8394 // Check if SCC is used first.
8395 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
8396 if (SCCIdx != -1) {
8397 if (MI.isCopy()) {
8398 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8399 Register DestReg = MI.getOperand(0).getReg();
8400
8401 MRI.replaceRegWith(DestReg, NewCond);
8402 CopyToDelete.push_back(&MI);
8403 } else {
8404
8405 if (NewCond.isValid())
8406 MI.getOperand(SCCIdx).setReg(NewCond);
8407
8408 Worklist.insert(&MI);
8409 }
8410 }
8411 // Exit if we find another SCC def.
8412 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
8413 break;
8414 }
8415 for (auto &Copy : CopyToDelete)
8416 Copy->eraseFromParent();
8417}
8418
8419// Instructions that use SCC may be converted to VALU instructions. When that
8420// happens, the SCC register is changed to VCC_LO. The instruction that defines
8421// SCC must be changed to an instruction that defines VCC. This function makes
8422// sure that the instruction that defines SCC is added to the moveToVALU
8423// worklist.
8424void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
8425 SIInstrWorklist &Worklist) const {
8426 // Look for a preceding instruction that either defines VCC or SCC. If VCC
8427 // then there is nothing to do because the defining instruction has been
8428 // converted to a VALU already. If SCC then that instruction needs to be
8429 // converted to a VALU.
8430 for (MachineInstr &MI :
8431 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
8432 SCCUseInst->getParent()->rend())) {
8433 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
8434 break;
8435 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
8436 Worklist.insert(&MI);
8437 break;
8438 }
8439 }
8440}
8441
8442const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
8443 const MachineInstr &Inst) const {
8444 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
8445
8446 switch (Inst.getOpcode()) {
8447 // For target instructions, getOpRegClass just returns the virtual register
8448 // class associated with the operand, so we need to find an equivalent VGPR
8449 // register class in order to move the instruction to the VALU.
8450 case AMDGPU::COPY:
8451 case AMDGPU::PHI:
8452 case AMDGPU::REG_SEQUENCE:
8453 case AMDGPU::INSERT_SUBREG:
8454 case AMDGPU::WQM:
8455 case AMDGPU::SOFT_WQM:
8456 case AMDGPU::STRICT_WWM:
8457 case AMDGPU::STRICT_WQM: {
8458 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
8459 if (RI.isAGPRClass(SrcRC)) {
8460 if (RI.isAGPRClass(NewDstRC))
8461 return nullptr;
8462
8463 switch (Inst.getOpcode()) {
8464 case AMDGPU::PHI:
8465 case AMDGPU::REG_SEQUENCE:
8466 case AMDGPU::INSERT_SUBREG:
8467 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
8468 break;
8469 default:
8470 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8471 }
8472
8473 if (!NewDstRC)
8474 return nullptr;
8475 } else {
8476 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
8477 return nullptr;
8478
8479 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8480 if (!NewDstRC)
8481 return nullptr;
8482 }
8483
8484 return NewDstRC;
8485 }
8486 default:
8487 return NewDstRC;
8488 }
8489}
8490
8491// Find the one SGPR operand we are allowed to use.
8492Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
8493 int OpIndices[3]) const {
8494 const MCInstrDesc &Desc = MI.getDesc();
8495
8496 // Find the one SGPR operand we are allowed to use.
8497 //
8498 // First we need to consider the instruction's operand requirements before
8499 // legalizing. Some operands are required to be SGPRs, such as implicit uses
8500 // of VCC, but we are still bound by the constant bus requirement to only use
8501 // one.
8502 //
8503 // If the operand's class is an SGPR, we can never move it.
8504
8505 Register SGPRReg = findImplicitSGPRRead(MI);
8506 if (SGPRReg)
8507 return SGPRReg;
8508
8509 Register UsedSGPRs[3] = {Register()};
8510 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8511
8512 for (unsigned i = 0; i < 3; ++i) {
8513 int Idx = OpIndices[i];
8514 if (Idx == -1)
8515 break;
8516
8517 const MachineOperand &MO = MI.getOperand(Idx);
8518 if (!MO.isReg())
8519 continue;
8520
8521 // Is this operand statically required to be an SGPR based on the operand
8522 // constraints?
8523 const TargetRegisterClass *OpRC =
8524 RI.getRegClass(Desc.operands()[Idx].RegClass);
8525 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
8526 if (IsRequiredSGPR)
8527 return MO.getReg();
8528
8529 // If this could be a VGPR or an SGPR, check the dynamic register class.
8530 Register Reg = MO.getReg();
8531 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
8532 if (RI.isSGPRClass(RegRC))
8533 UsedSGPRs[i] = Reg;
8534 }
8535
8536 // We don't have a required SGPR operand, so we have a bit more freedom in
8537 // selecting operands to move.
8538
8539 // Try to select the most used SGPR. If an SGPR is equal to one of the
8540 // others, we choose that.
8541 //
8542 // e.g.
8543 // V_FMA_F32 v0, s0, s0, s0 -> No moves
8544 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
8545
8546 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
8547 // prefer those.
8548
8549 if (UsedSGPRs[0]) {
8550 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
8551 SGPRReg = UsedSGPRs[0];
8552 }
8553
8554 if (!SGPRReg && UsedSGPRs[1]) {
8555 if (UsedSGPRs[1] == UsedSGPRs[2])
8556 SGPRReg = UsedSGPRs[1];
8557 }
8558
8559 return SGPRReg;
8560}
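A minimal standalone sketch of the tie-breaking rule above (illustrative, not taken from the LLVM tree): among up to three SGPR operands, prefer one that occurs more than once, since a repeated SGPR still costs only a single constant-bus read. The ToyReg alias and pickMostUsedSGPR name are assumptions made for the example.

#include <cassert>

// Toy stand-in for llvm::Register: 0 means "no SGPR was found in this slot".
using ToyReg = unsigned;

// Mirrors the selection above: an SGPR that appears twice is preferred,
// because reusing it does not add a second constant-bus operand.
ToyReg pickMostUsedSGPR(const ToyReg Used[3]) {
  if (Used[0] && (Used[0] == Used[1] || Used[0] == Used[2]))
    return Used[0];
  if (Used[1] && Used[1] == Used[2])
    return Used[1];
  return ToyReg(0); // No repeated SGPR; the caller may keep any one of them.
}

int main() {
  const ToyReg A[3] = {5, 7, 5}; // like V_FMA_F32 v0, s5, s7, s5 -> keep s5, move s7
  const ToyReg B[3] = {5, 7, 9}; // all distinct -> no preferred SGPR
  assert(pickMostUsedSGPR(A) == 5);
  assert(pickMostUsedSGPR(B) == 0);
  return 0;
}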
8561
8563 unsigned OperandName) const {
8564 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
8565 if (Idx == -1)
8566 return nullptr;
8567
8568 return &MI.getOperand(Idx);
8569}
8570
8576 return (Format << 44) |
8577 (1ULL << 56) | // RESOURCE_LEVEL = 1
8578 (3ULL << 60); // OOB_SELECT = 3
8579 }
8580
8581 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
8582 if (ST.isAmdHsaOS()) {
8583 // Set ATC = 1. GFX9 doesn't have this bit.
8585 RsrcDataFormat |= (1ULL << 56);
8586
8587 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
8588 // BTW, it disables TC L2 and therefore decreases performance.
8590 RsrcDataFormat |= (2ULL << 59);
8591 }
8592
8593 return RsrcDataFormat;
8594}
8595
8599 0xffffffff; // Size;
8600
8601 // GFX9 doesn't have ELEMENT_SIZE.
8603 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
8604 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
8605 }
8606
8607 // IndexStride = 64 / 32.
8608 uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
8609 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
8610
8611 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
8612 // Clear them unless we want a huge stride.
8615 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
8616
8617 return Rsrc23;
8618}
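A hedged sketch of the small field computations above. The real shift positions come from AMDGPU::RSRC_ELEMENT_SIZE_SHIFT and AMDGPU::RSRC_INDEX_STRIDE_SHIFT; here they are passed in as parameters, and the shift values 0 and 4 used in the example are made up purely for illustration.

#include <cassert>
#include <cstdint>

// Integer log2 for powers of two, standing in for llvm::Log2_32.
static uint64_t log2u(uint64_t V) {
  uint64_t R = 0;
  while (V >>= 1)
    ++R;
  return R;
}

// ELEMENT_SIZE encodes log2(bytes) - 1 and INDEX_STRIDE encodes 3 for wave64
// or 2 for wave32, exactly as derived in getScratchRsrcWords23 above.
uint64_t packScratchFields(unsigned MaxPrivateElementSize, unsigned WavefrontSize,
                           unsigned EltSizeShift, unsigned IndexStrideShift) {
  uint64_t EltSizeValue = log2u(MaxPrivateElementSize) - 1;
  uint64_t IndexStride = WavefrontSize == 64 ? 3 : 2;
  return (EltSizeValue << EltSizeShift) | (IndexStride << IndexStrideShift);
}

int main() {
  // 16-byte private elements on a wave64 target with assumed shifts 0 and 4:
  // both fields are 3, so the packed value is 0x33.
  assert(packScratchFields(16, 64, 0, 4) == 0x33);
  return 0;
}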
8619
8621 unsigned Opc = MI.getOpcode();
8622
8623 return isSMRD(Opc);
8624}
8625
8627 return get(Opc).mayLoad() &&
8628 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
8629}
8630
8632 int &FrameIndex) const {
8633 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
8634 if (!Addr || !Addr->isFI())
8635 return Register();
8636
8637 assert(!MI.memoperands_empty() &&
8638 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
8639
8640 FrameIndex = Addr->getIndex();
8641 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
8642}
8643
8645 int &FrameIndex) const {
8646 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
8647 assert(Addr && Addr->isFI());
8648 FrameIndex = Addr->getIndex();
8649 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
8650}
8651
8653 int &FrameIndex) const {
8654 if (!MI.mayLoad())
8655 return Register();
8656
8657 if (isMUBUF(MI) || isVGPRSpill(MI))
8658 return isStackAccess(MI, FrameIndex);
8659
8660 if (isSGPRSpill(MI))
8661 return isSGPRStackAccess(MI, FrameIndex);
8662
8663 return Register();
8664}
8665
8667 int &FrameIndex) const {
8668 if (!MI.mayStore())
8669 return Register();
8670
8671 if (isMUBUF(MI) || isVGPRSpill(MI))
8672 return isStackAccess(MI, FrameIndex);
8673
8674 if (isSGPRSpill(MI))
8675 return isSGPRStackAccess(MI, FrameIndex);
8676
8677 return Register();
8678}
8679
8681 unsigned Size = 0;
8682 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
8683 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
8684 while (++I != E && I->isInsideBundle()) {
8685 assert(!I->isBundle() && "No nested bundle!");
8686 Size += getInstSizeInBytes(*I);
8687 }
8688
8689 return Size;
8690}
8691
8693 unsigned Opc = MI.getOpcode();
8695 unsigned DescSize = Desc.getSize();
8696
8697 // If we have a definitive size, we can use it. Otherwise we need to inspect
8698 // the operands to know the size.
8699 if (isFixedSize(MI)) {
8700 unsigned Size = DescSize;
8701
8702 // If we hit the buggy offset, an extra nop will be inserted in MC so
8703 // estimate the worst case.
8704 if (MI.isBranch() && ST.hasOffset3fBug())
8705 Size += 4;
8706
8707 return Size;
8708 }
8709
8710 // Instructions may have a 32-bit literal encoded after them. Check
8711 // operands that could ever be literals.
8712 if (isVALU(MI) || isSALU(MI)) {
8713 if (isDPP(MI))
8714 return DescSize;
8715 bool HasLiteral = false;
8716 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
8717 const MachineOperand &Op = MI.getOperand(I);
8718 const MCOperandInfo &OpInfo = Desc.operands()[I];
8719 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
8720 HasLiteral = true;
8721 break;
8722 }
8723 }
8724 return HasLiteral ? DescSize + 4 : DescSize;
8725 }
8726
8727 // Check whether we have extra NSA words.
8728 if (isMIMG(MI)) {
8729 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
8730 if (VAddr0Idx < 0)
8731 return 8;
8732
8733 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
8734 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
8735 }
8736
8737 switch (Opc) {
8738 case TargetOpcode::BUNDLE:
8739 return getInstBundleSize(MI);
8740 case TargetOpcode::INLINEASM:
8741 case TargetOpcode::INLINEASM_BR: {
8742 const MachineFunction *MF = MI.getParent()->getParent();
8743 const char *AsmStr = MI.getOperand(0).getSymbolName();
8744 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
8745 }
8746 default:
8747 if (MI.isMetaInstruction())
8748 return 0;
8749 return DescSize;
8750 }
8751}
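Two worked instances of the size adjustments computed above, as a standalone sketch (the operand indices in the example are hypothetical, not taken from any real opcode's operand table).

#include <cassert>

// A VALU/SALU encoding grows by 4 bytes when one operand is a 32-bit literal.
unsigned sizeWithLiteral(unsigned DescSize, bool HasLiteral) {
  return HasLiteral ? DescSize + 4 : DescSize;
}

// MIMG NSA forms add extra dwords derived from how many vaddr operands sit
// between vaddr0 and srsrc; the +2 and /4 mirror the rounding used above.
unsigned mimgNSASize(int VAddr0Idx, int RSrcIdx) {
  return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
}

int main() {
  assert(sizeWithLiteral(8, true) == 12); // e.g. an 8-byte encoding plus a literal
  // Hypothetical layout: vaddr0 at operand 1, srsrc at operand 6
  // -> (6 - 1 + 2) / 4 = 1 extra NSA dword -> 12 bytes total.
  assert(mimgNSASize(1, 6) == 12);
  return 0;
}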
8752
8754 if (!isFLAT(MI))
8755 return false;
8756
8757 if (MI.memoperands_empty())
8758 return true;
8759
8760 for (const MachineMemOperand *MMO : MI.memoperands()) {
8761 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
8762 return true;
8763 }
8764 return false;
8765}
8766
8768 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
8769}
8770
8772 MachineBasicBlock *IfEnd) const {
8774 assert(TI != IfEntry->end());
8775
8776 MachineInstr *Branch = &(*TI);
8777 MachineFunction *MF = IfEntry->getParent();
8779
8780 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8781 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8782 MachineInstr *SIIF =
8783 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
8784 .add(Branch->getOperand(0))
8785 .add(Branch->getOperand(1));
8786 MachineInstr *SIEND =
8787 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
8788 .addReg(DstReg);
8789
8790 IfEntry->erase(TI);
8791 IfEntry->insert(IfEntry->end(), SIIF);
8792 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
8793 }
8794}
8795
8797 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
8799 // We expect 2 terminators, one conditional and one unconditional.
8800 assert(TI != LoopEnd->end());
8801
8802 MachineInstr *Branch = &(*TI);
8803 MachineFunction *MF = LoopEnd->getParent();
8805
8806 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8807
8808 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8809 Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
8810 MachineInstrBuilder HeaderPHIBuilder =
8811 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
8812 for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) {
8813 if (PMBB == LoopEnd) {
8814 HeaderPHIBuilder.addReg(BackEdgeReg);
8815 } else {
8816 Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
8817 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
8818 ZeroReg, 0);
8819 HeaderPHIBuilder.addReg(ZeroReg);
8820 }
8821 HeaderPHIBuilder.addMBB(PMBB);
8822 }
8823 MachineInstr *HeaderPhi = HeaderPHIBuilder;
8824 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
8825 get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
8826 .addReg(DstReg)
8827 .add(Branch->getOperand(0));
8828 MachineInstr *SILOOP =
8829 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
8830 .addReg(BackEdgeReg)
8831 .addMBB(LoopEntry);
8832
8833 LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
8834 LoopEnd->erase(TI);
8835 LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
8836 LoopEnd->insert(LoopEnd->end(), SILOOP);
8837 }
8838}
8839
8842 static const std::pair<int, const char *> TargetIndices[] = {
8843 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
8844 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
8845 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
8846 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
8847 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
8848 return ArrayRef(TargetIndices);
8849}
8850
8851/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
8852/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
8855 const ScheduleDAG *DAG) const {
8856 return new GCNHazardRecognizer(DAG->MF);
8857}
8858
8859/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
8860/// pass.
8863 return new GCNHazardRecognizer(MF);
8864}
8865
8866// Called during:
8867// - pre-RA scheduling and post-RA scheduling
8870 const ScheduleDAGMI *DAG) const {
8871 // Borrowed from Arm Target
8872 // We would like to restrict this hazard recognizer to only
8873 // post-RA scheduling; we can tell that we're post-RA because we don't
8874 // track VRegLiveness.
8875 if (!DAG->hasVRegLiveness())
8876 return new GCNHazardRecognizer(DAG->MF);
8878}
8879
8880std::pair<unsigned, unsigned>
8882 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
8883}
8884
8887 static const std::pair<unsigned, const char *> TargetFlags[] = {
8888 { MO_GOTPCREL, "amdgpu-gotprel" },
8889 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
8890 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
8891 { MO_REL32_LO, "amdgpu-rel32-lo" },
8892 { MO_REL32_HI, "amdgpu-rel32-hi" },
8893 { MO_ABS32_LO, "amdgpu-abs32-lo" },
8894 { MO_ABS32_HI, "amdgpu-abs32-hi" },
8895 };
8896
8897 return ArrayRef(TargetFlags);
8898}
8899
8902 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8903 {
8904 {MONoClobber, "amdgpu-noclobber"},
8905 {MOLastUse, "amdgpu-last-use"},
8906 };
8907
8908 return ArrayRef(TargetFlags);
8909}
8910
8912 const MachineFunction &MF) const {
8914 assert(SrcReg.isVirtual());
8915 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
8916 return AMDGPU::WWM_COPY;
8917
8918 return AMDGPU::COPY;
8919}
8920
8922 Register Reg) const {
8923 // We need to handle instructions which may be inserted during register
8924 // allocation to handle the prolog. The initial prolog instruction may have
8925 // been separated from the start of the block by spills and copies that the
8926 // prolog needs. However, the insertions for scalar registers can
8927 // always be placed at the BB top as they are independent of the exec mask
8928 // value.
8929 bool IsNullOrVectorRegister = true;
8930 if (Reg) {
8931 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8932 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
8933 }
8934
8935 uint16_t Opcode = MI.getOpcode();
8936 // FIXME: Copies inserted in the block prolog for live-range split should also
8937 // be included.
8938 return IsNullOrVectorRegister &&
8939 (isSpill(Opcode) || (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
8940 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
8941}
8942
8946 const DebugLoc &DL,
8947 Register DestReg) const {
8948 if (ST.hasAddNoCarry())
8949 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
8950
8952 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
8953 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
8954
8955 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8956 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8957}
8958
8961 const DebugLoc &DL,
8962 Register DestReg,
8963 RegScavenger &RS) const {
8964 if (ST.hasAddNoCarry())
8965 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
8966
8967 // If available, prefer to use vcc.
8968 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
8969 ? Register(RI.getVCC())
8971 *RI.getBoolRC(), I, /* RestoreAfter */ false,
8972 0, /* AllowSpill */ false);
8973
8974 // TODO: Users need to deal with this.
8975 if (!UnusedCarry.isValid())
8976 return MachineInstrBuilder();
8977
8978 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8979 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8980}
8981
8982bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
8983 switch (Opcode) {
8984 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
8985 case AMDGPU::SI_KILL_I1_TERMINATOR:
8986 return true;
8987 default:
8988 return false;
8989 }
8990}
8991
8993 switch (Opcode) {
8994 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
8995 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
8996 case AMDGPU::SI_KILL_I1_PSEUDO:
8997 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
8998 default:
8999 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9000 }
9001}
9002
9003bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9004 return Imm <= getMaxMUBUFImmOffset(ST);
9005}
9006
9008 // The GFX12 field is a 24-bit signed byte offset, but only non-negative values are legal here, leaving 23 usable bits.
9009 const unsigned OffsetBits =
9010 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9011 return (1 << OffsetBits) - 1;
9012}
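The limit above is just (1 << bits) - 1; a tiny sketch with the two widths mentioned in the code (12 bits pre-GFX12, 23 usable bits on GFX12):

#include <cassert>

unsigned maxImmOffset(unsigned OffsetBits) { return (1u << OffsetBits) - 1; }

int main() {
  assert(maxImmOffset(12) == 4095);    // pre-GFX12 MUBUF immediate field
  assert(maxImmOffset(23) == 8388607); // GFX12: 24-bit signed, non-negative half
  return 0;
}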
9013
9015 if (!ST.isWave32())
9016 return;
9017
9018 if (MI.isInlineAsm())
9019 return;
9020
9021 for (auto &Op : MI.implicit_operands()) {
9022 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9023 Op.setReg(AMDGPU::VCC_LO);
9024 }
9025}
9026
9028 if (!isSMRD(MI))
9029 return false;
9030
9031 // Check that it is using a buffer resource.
9032 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9033 if (Idx == -1) // e.g. s_memtime
9034 return false;
9035
9036 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9037 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9038}
9039
9040// Given Imm, split it into the values to put into the SOffset and ImmOffset
9041// fields in an MUBUF instruction. Return false if it is not possible (due to a
9042// hardware bug needing a workaround).
9043//
9044// The required alignment ensures that individual address components remain
9045// aligned if they are aligned to begin with. It also ensures that additional
9046// offsets within the given alignment can be added to the resulting ImmOffset.
9048 uint32_t &ImmOffset, Align Alignment) const {
9049 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9050 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9051 uint32_t Overflow = 0;
9052
9053 if (Imm > MaxImm) {
9054 if (Imm <= MaxImm + 64) {
9055 // Use an SOffset inline constant for 4..64
9056 Overflow = Imm - MaxImm;
9057 Imm = MaxImm;
9058 } else {
9059 // Try to keep the same value in SOffset for adjacent loads, so that
9060 // the corresponding register contents can be re-used.
9061 //
9062 // Load values with all low-bits (except for alignment bits) set into
9063 // SOffset, so that a larger range of values can be covered using
9064 // s_movk_i32.
9065 //
9066 // Atomic operations fail to work correctly when individual address
9067 // components are unaligned, even if their sum is aligned.
9068 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9069 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9070 Imm = Low;
9071 Overflow = High - Alignment.value();
9072 }
9073 }
9074
9075 if (Overflow > 0) {
9076 // There is a hardware bug in SI and CI which prevents address clamping in
9077 // MUBUF instructions from working correctly with SOffsets. The immediate
9078 // offset is unaffected.
9079 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
9080 return false;
9081
9082 // It is not possible to set an immediate in the SOffset field on some targets.
9083 if (ST.hasRestrictedSOffset())
9084 return false;
9085 }
9086
9087 ImmOffset = Imm;
9088 SOffset = Overflow;
9089 return true;
9090}
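A standalone sketch of the split arithmetic above, assuming a 4095-byte immediate field and 4-byte alignment (both values are illustrative; the real limits come from the subtarget, and the hardware-bug early exits are omitted here).

#include <cassert>
#include <cstdint>

void splitOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset) {
  const uint32_t MaxOffset = 4095;                  // assumed immediate-field maximum
  const uint32_t Align = 4;                         // assumed required alignment
  const uint32_t MaxImm = MaxOffset & ~(Align - 1); // alignDown -> 4092
  uint32_t Overflow = 0;

  if (Imm > MaxImm) {
    if (Imm <= MaxImm + 64) {
      // Small excess fits in an SOffset inline constant.
      Overflow = Imm - MaxImm;
      Imm = MaxImm;
    } else {
      // Keep all low bits except the alignment bits in the immediate so that
      // adjacent loads can share the same SOffset register value.
      uint32_t High = (Imm + Align) & ~MaxOffset;
      uint32_t Low = (Imm + Align) & MaxOffset;
      Imm = Low;
      Overflow = High - Align;
    }
  }
  ImmOffset = Imm;
  SOffset = Overflow;
}

int main() {
  uint32_t S, I;
  splitOffset(4100, S, I);  // slightly past the field
  assert(S == 8 && I == 4092 && S + I == 4100);
  splitOffset(10000, S, I); // far past the field
  assert(S == 8188 && I == 1812 && S + I == 10000);
  return 0;
}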
9091
9092// Depending on the used address space and instructions, some immediate offsets
9093// are allowed and some are not.
9094 // Pre-GFX12, flat instruction offsets can only be non-negative, while global
9095 // and scratch instruction offsets can also be negative. On GFX12, offsets can
9096 // be negative for all variants.
9097//
9098// There are several bugs related to these offsets:
9099// On gfx10.1, flat instructions that go into the global address space cannot
9100// use an offset.
9101//
9102// For scratch instructions, the address can be either an SGPR or a VGPR.
9103// The following offsets can be used, depending on the architecture (x means
9104// cannot be used):
9105// +----------------------------+------+------+
9106// | Address-Mode | SGPR | VGPR |
9107// +----------------------------+------+------+
9108// | gfx9 | | |
9109// | negative, 4-aligned offset | x | ok |
9110// | negative, unaligned offset | x | ok |
9111// +----------------------------+------+------+
9112// | gfx10 | | |
9113// | negative, 4-aligned offset | ok | ok |
9114// | negative, unaligned offset | ok | x |
9115// +----------------------------+------+------+
9116// | gfx10.3 | | |
9117// | negative, 4-aligned offset | ok | ok |
9118// | negative, unaligned offset | ok | ok |
9119// +----------------------------+------+------+
9120//
9121// This function ignores the addressing mode, so if an offset cannot be used in
9122// one addressing mode, it is considered illegal.
9123bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9124 uint64_t FlatVariant) const {
9125 // TODO: Should 0 be special cased?
9126 if (!ST.hasFlatInstOffsets())
9127 return false;
9128
9129 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9130 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9131 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9132 return false;
9133
9135 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9136 (Offset % 4) != 0) {
9137 return false;
9138 }
9139
9140 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9141 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9142 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9143}
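The final check reduces to a signed-range test plus the sign restriction; a small illustration, assuming a hypothetical 13-bit signed offset field (the real width comes from AMDGPU::getNumFlatOffsetBits and differs per generation).

#include <cassert>
#include <cstdint>

bool fitsFlatOffset(int64_t Offset, unsigned N, bool AllowNegative) {
  const int64_t Min = -(int64_t(1) << (N - 1));
  const int64_t Max = (int64_t(1) << (N - 1)) - 1;
  const bool FitsSigned = Offset >= Min && Offset <= Max; // like isIntN(N, Offset)
  return FitsSigned && (AllowNegative || Offset >= 0);
}

int main() {
  assert(fitsFlatOffset(4095, 13, true));   // largest positive 13-bit value
  assert(!fitsFlatOffset(4096, 13, true));  // one past the field
  assert(fitsFlatOffset(-4096, 13, true));  // smallest negative, if allowed
  assert(!fitsFlatOffset(-1, 13, false));   // negative offsets disallowed
  return 0;
}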
9144
9145// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
9146std::pair<int64_t, int64_t>
9147SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9148 uint64_t FlatVariant) const {
9149 int64_t RemainderOffset = COffsetVal;
9150 int64_t ImmField = 0;
9151
9152 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9153 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9154
9155 if (AllowNegative) {
9156 // Use signed division by a power of two to truncate towards 0.
9157 int64_t D = 1LL << NumBits;
9158 RemainderOffset = (COffsetVal / D) * D;
9159 ImmField = COffsetVal - RemainderOffset;
9160
9162 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9163 (ImmField % 4) != 0) {
9164 // Make ImmField a multiple of 4
9165 RemainderOffset += ImmField % 4;
9166 ImmField -= ImmField % 4;
9167 }
9168 } else if (COffsetVal >= 0) {
9169 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9170 RemainderOffset = COffsetVal - ImmField;
9171 }
9172
9173 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9174 assert(RemainderOffset + ImmField == COffsetVal);
9175 return {ImmField, RemainderOffset};
9176}
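A worked instance of the AllowNegative path above, assuming 12 magnitude bits (i.e. a 13-bit signed immediate field); the alignment fix-up for the scratch bug is left out of the sketch.

#include <cassert>
#include <cstdint>
#include <utility>

std::pair<int64_t, int64_t> splitSigned(int64_t COffsetVal, unsigned NumBits) {
  const int64_t D = int64_t(1) << NumBits;
  // Signed division truncates toward zero, so the remainder keeps the sign of
  // the original offset and the immediate stays within +/- (1 << NumBits).
  int64_t RemainderOffset = (COffsetVal / D) * D;
  int64_t ImmField = COffsetVal - RemainderOffset;
  return {ImmField, RemainderOffset};
}

int main() {
  auto [Imm, Rem] = splitSigned(-5000, 12);
  // -5000 / 4096 truncates to -1: the remainder is -4096 and the immediate
  // field carries the remaining -904.
  assert(Rem == -4096 && Imm == -904 && Imm + Rem == -5000);

  auto [Imm2, Rem2] = splitSigned(10000, 12);
  assert(Rem2 == 8192 && Imm2 == 1808 && Imm2 + Rem2 == 10000);
  return 0;
}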
9177
9179 if (ST.hasNegativeScratchOffsetBug() &&
9180 FlatVariant == SIInstrFlags::FlatScratch)
9181 return false;
9182
9183 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9184}
9185
9186static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9187 switch (ST.getGeneration()) {
9188 default:
9189 break;
9192 return SIEncodingFamily::SI;
9195 return SIEncodingFamily::VI;
9202 }
9203 llvm_unreachable("Unknown subtarget generation!");
9204}
9205
9206bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9207 switch(MCOp) {
9208 // These opcodes use indirect register addressing so
9209 // they need special handling by codegen (currently missing).
9210 // Therefore it is too risky to allow these opcodes
9211 // to be selected by dpp combiner or sdwa peepholer.
9212 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9213 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9214 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9215 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9216 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9217 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9218 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9219 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9220 return true;
9221 default:
9222 return false;
9223 }
9224}
9225
9226int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9227 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9228
9229 unsigned Gen = subtargetEncodingFamily(ST);
9230
9231 if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
9234
9235 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9236 // subtarget has UnpackedD16VMem feature.
9237 // TODO: remove this when we discard GFX80 encoding.
9238 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9240
9241 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9242 switch (ST.getGeneration()) {
9243 default:
9245 break;
9248 break;
9251 break;
9252 }
9253 }
9254
9255 if (isMAI(Opcode)) {
9256 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9257 if (MFMAOp != -1)
9258 Opcode = MFMAOp;
9259 }
9260
9261 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9262
9263 // -1 means that Opcode is already a native instruction.
9264 if (MCOp == -1)
9265 return Opcode;
9266
9267 if (ST.hasGFX90AInsts()) {
9268 uint16_t NMCOp = (uint16_t)-1;
9269 if (ST.hasGFX940Insts())
9271 if (NMCOp == (uint16_t)-1)
9273 if (NMCOp == (uint16_t)-1)
9275 if (NMCOp != (uint16_t)-1)
9276 MCOp = NMCOp;
9277 }
9278
9279 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9280 // no encoding in the given subtarget generation.
9281 if (MCOp == (uint16_t)-1)
9282 return -1;
9283
9284 if (isAsmOnlyOpcode(MCOp))
9285 return -1;
9286
9287 return MCOp;
9288}
9289
9290static
9292 assert(RegOpnd.isReg());
9293 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9294 getRegSubRegPair(RegOpnd);
9295}
9296
9299 assert(MI.isRegSequence());
9300 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9301 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9302 auto &RegOp = MI.getOperand(1 + 2 * I);
9303 return getRegOrUndef(RegOp);
9304 }
9306}
9307
9308// Try to find the definition of reg:subreg in subreg-manipulation pseudos
9309// Following a subreg of reg:subreg isn't supported
9312 if (!RSR.SubReg)
9313 return false;
9314 switch (MI.getOpcode()) {
9315 default: break;
9316 case AMDGPU::REG_SEQUENCE:
9317 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
9318 return true;
9319 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
9320 case AMDGPU::INSERT_SUBREG:
9321 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
9322 // inserted the subreg we're looking for
9323 RSR = getRegOrUndef(MI.getOperand(2));
9324 else { // the subreg in the rest of the reg
9325 auto R1 = getRegOrUndef(MI.getOperand(1));
9326 if (R1.SubReg) // subreg of subreg isn't supported
9327 return false;
9328 RSR.Reg = R1.Reg;
9329 }
9330 return true;
9331 }
9332 return false;
9333}
9334
9337 assert(MRI.isSSA());
9338 if (!P.Reg.isVirtual())
9339 return nullptr;
9340
9341 auto RSR = P;
9342 auto *DefInst = MRI.getVRegDef(RSR.Reg);
9343 while (auto *MI = DefInst) {
9344 DefInst = nullptr;
9345 switch (MI->getOpcode()) {
9346 case AMDGPU::COPY:
9347 case AMDGPU::V_MOV_B32_e32: {
9348 auto &Op1 = MI->getOperand(1);
9349 if (Op1.isReg() && Op1.getReg().isVirtual()) {
9350 if (Op1.isUndef())
9351 return nullptr;
9352 RSR = getRegSubRegPair(Op1);
9353 DefInst = MRI.getVRegDef(RSR.Reg);
9354 }
9355 break;
9356 }
9357 default:
9358 if (followSubRegDef(*MI, RSR)) {
9359 if (!RSR.Reg)
9360 return nullptr;
9361 DefInst = MRI.getVRegDef(RSR.Reg);
9362 }
9363 }
9364 if (!DefInst)
9365 return MI;
9366 }
9367 return nullptr;
9368}
9369
9371 Register VReg,
9372 const MachineInstr &DefMI,
9373 const MachineInstr &UseMI) {
9374 assert(MRI.isSSA() && "Must be run on SSA");
9375
9376 auto *TRI = MRI.getTargetRegisterInfo();
9377 auto *DefBB = DefMI.getParent();
9378
9379 // Don't bother searching between blocks, although it is possible this block
9380 // doesn't modify exec.
9381 if (UseMI.getParent() != DefBB)
9382 return true;
9383
9384 const int MaxInstScan = 20;
9385 int NumInst = 0;
9386
9387 // Stop scan at the use.
9388 auto E = UseMI.getIterator();
9389 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
9390 if (I->isDebugInstr())
9391 continue;
9392
9393 if (++NumInst > MaxInstScan)
9394 return true;
9395
9396 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9397 return true;
9398 }
9399
9400 return false;
9401}
9402
9404 Register VReg,
9405 const MachineInstr &DefMI) {
9406 assert(MRI.isSSA() && "Must be run on SSA");
9407
9408 auto *TRI = MRI.getTargetRegisterInfo();
9409 auto *DefBB = DefMI.getParent();
9410
9411 const int MaxUseScan = 10;
9412 int NumUse = 0;
9413
9414 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
9415 auto &UseInst = *Use.getParent();
9416 // Don't bother searching between blocks, although it is possible this block
9417 // doesn't modify exec.
9418 if (UseInst.getParent() != DefBB || UseInst.isPHI())
9419 return true;
9420
9421 if (++NumUse > MaxUseScan)
9422 return true;
9423 }
9424
9425 if (NumUse == 0)
9426 return false;
9427
9428 const int MaxInstScan = 20;
9429 int NumInst = 0;
9430
9431 // Stop scan when we have seen all the uses.
9432 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
9433 assert(I != DefBB->end());
9434
9435 if (I->isDebugInstr())
9436 continue;
9437
9438 if (++NumInst > MaxInstScan)
9439 return true;
9440
9441 for (const MachineOperand &Op : I->operands()) {
9442 // We don't check reg masks here as they're used only on calls:
9443 // 1. EXEC is only considered const within one BB
9444 // 2. Call should be a terminator instruction if present in a BB
9445
9446 if (!Op.isReg())
9447 continue;
9448
9449 Register Reg = Op.getReg();
9450 if (Op.isUse()) {
9451 if (Reg == VReg && --NumUse == 0)
9452 return false;
9453 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
9454 return true;
9455 }
9456 }
9457}
9458
9461 const DebugLoc &DL, Register Src, Register Dst) const {
9462 auto Cur = MBB.begin();
9463 if (Cur != MBB.end())
9464 do {
9465 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
9466 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
9467 ++Cur;
9468 } while (Cur != MBB.end() && Cur != LastPHIIt);
9469
9470 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
9471 Dst);
9472}
9473
9476 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
9477 if (InsPt != MBB.end() &&
9478 (InsPt->getOpcode() == AMDGPU::SI_IF ||
9479 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9480 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9481 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
9482 InsPt++;
9483 return BuildMI(MBB, InsPt, DL,
9484 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
9485 : AMDGPU::S_MOV_B64_term),
9486 Dst)
9487 .addReg(Src, 0, SrcSubReg)
9488 .addReg(AMDGPU::EXEC, RegState::Implicit);
9489 }
9490 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
9491 Dst);
9492}
9493
9494bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
9495
9498 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
9499 VirtRegMap *VRM) const {
9500 // This is a bit of a hack (copied from AArch64). Consider this instruction:
9501 //
9502 // %0:sreg_32 = COPY $m0
9503 //
9504 // We explicitly chose SReg_32 for the virtual register so such a copy might
9505 // be eliminated by RegisterCoalescer. However, that may not be possible, and
9506 // %0 may even spill. We can't spill $m0 normally (it would require copying to
9507 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
9508 // TargetInstrInfo::foldMemoryOperand() is going to try.
9509 // A similar issue also exists with spilling and reloading $exec registers.
9510 //
9511 // To prevent that, constrain the %0 register class here.
9512 if (isFullCopyInstr(MI)) {
9513 Register DstReg = MI.getOperand(0).getReg();
9514 Register SrcReg = MI.getOperand(1).getReg();
9515 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
9516 (DstReg.isVirtual() != SrcReg.isVirtual())) {
9518 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
9519 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
9520 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
9521 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
9522 return nullptr;
9523 } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
9524 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
9525 return nullptr;
9526 }
9527 }
9528 }
9529
9530 return nullptr;
9531}
9532
9534 const MachineInstr &MI,
9535 unsigned *PredCost) const {
9536 if (MI.isBundle()) {
9537 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
9538 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
9539 unsigned Lat = 0, Count = 0;
9540 for (++I; I != E && I->isBundledWithPred(); ++I) {
9541 ++Count;
9542 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
9543 }
9544 return Lat + Count - 1;
9545 }
9546
9547 return SchedModel.computeInstrLatency(&MI);
9548}
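The bundle case above is a simple estimate: the longest per-instruction latency plus one issue slot for each additional bundled instruction. A small sketch:

#include <algorithm>
#include <cassert>
#include <vector>

unsigned bundleLatency(const std::vector<unsigned> &PerInstLatency) {
  unsigned Lat = 0, Count = 0;
  for (unsigned L : PerInstLatency) {
    ++Count;
    Lat = std::max(Lat, L);
  }
  return Lat + Count - 1; // same Lat + Count - 1 estimate as the code above
}

int main() {
  // Three bundled instructions with latencies 4, 2 and 1 -> 4 + 3 - 1 = 6.
  assert(bundleLatency({4, 2, 1}) == 6);
  return 0;
}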
9549
9552 unsigned opcode = MI.getOpcode();
9553 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
9554 auto IID = GI->getIntrinsicID();
9559
9560 switch (IID) {
9561 case Intrinsic::amdgcn_if:
9562 case Intrinsic::amdgcn_else:
9563 // FIXME: Uniform if second result
9564 break;
9565 }
9566
9567 return InstructionUniformity::Default;
9568 }
9569
9570 // Loads from the private and flat address spaces are divergent, because
9571 // threads can execute the load instruction with the same inputs and get
9572 // different results.
9573 //
9574 // All other loads are not divergent, because if threads issue loads with the
9575 // same arguments, they will always get the same result.
9576 if (opcode == AMDGPU::G_LOAD) {
9577 if (MI.memoperands_empty())
9578 return InstructionUniformity::NeverUniform; // conservative assumption
9579
9580 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9581 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9582 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9583 })) {
9584 // At least one MMO in a non-global address space.
9585 return InstructionUniformity::NeverUniform;
9586 }
9587 return InstructionUniformity::Default;
9588 }
9589
9590 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
9591 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
9592 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
9593 AMDGPU::isGenericAtomic(opcode)) {
9594 return InstructionUniformity::NeverUniform;
9595 }
9596 return InstructionUniformity::Default;
9597}
9598
9601
9602 if (isNeverUniform(MI))
9603 return InstructionUniformity::NeverUniform;
9604
9605 unsigned opcode = MI.getOpcode();
9606 if (opcode == AMDGPU::V_READLANE_B32 ||
9607 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
9608 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
9609 return InstructionUniformity::AlwaysUniform;
9610
9611 if (isCopyInstr(MI)) {
9612 const MachineOperand &srcOp = MI.getOperand(1);
9613 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
9614 const TargetRegisterClass *regClass =
9615 RI.getPhysRegBaseClass(srcOp.getReg());
9618 }
9620 }
9621
9622 // GMIR handling
9623 if (MI.isPreISelOpcode())
9624 return getGenericInstructionUniformity(MI);
9625
9626 // Atomics are divergent because they are executed sequentially: when an
9627 // atomic operation refers to the same address in each thread, each
9628 // thread after the first sees the value written by the previous thread as
9629 // the original value.
9630
9631 if (isAtomic(MI))
9632 return InstructionUniformity::NeverUniform;
9633
9634 // Loads from the private and flat address spaces are divergent, because
9635 // threads can execute the load instruction with the same inputs and get
9636 // different results.
9637 if (isFLAT(MI) && MI.mayLoad()) {
9638 if (MI.memoperands_empty())
9639 return InstructionUniformity::NeverUniform; // conservative assumption
9640
9641 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9642 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9643 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9644 })) {
9645 // At least one MMO in a non-global address space.
9646 return InstructionUniformity::NeverUniform;
9647 }
9648
9649 return InstructionUniformity::Default;
9650 }
9651
9652 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9653 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
9654
9655 // FIXME: It's conceptually broken to report this for an instruction, and not
9656 // a specific def operand. For inline asm in particular, there could be mixed
9657 // uniform and divergent results.
9658 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
9659 const MachineOperand &SrcOp = MI.getOperand(I);
9660 if (!SrcOp.isReg())
9661 continue;
9662
9663 Register Reg = SrcOp.getReg();
9664 if (!Reg || !SrcOp.readsReg())
9665 continue;
9666
9667 // If RegBank is null, this is unassigned or an unallocatable special
9668 // register, which are all scalars.
9669 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
9670 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
9671 return InstructionUniformity::NeverUniform;
9672 }
9673
9674 // TODO: Uniformity check conditions above can be rearranged for more
9675 // readability
9676
9677 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
9678 // currently turned into no-op COPYs by SelectionDAG ISel and are
9679 // therefore no longer recognizable.
9680
9681 return InstructionUniformity::Default;
9682}
9683
9685 switch (MF.getFunction().getCallingConv()) {
9687 return 1;
9689 return 2;
9691 return 3;
9695 report_fatal_error("ds_ordered_count unsupported for this calling conv");
9698 case CallingConv::C:
9699 case CallingConv::Fast:
9700 default:
9701 // Assume other calling conventions are various compute callable functions
9702 return 0;
9703 }
9704}
9705
9707 Register &SrcReg2, int64_t &CmpMask,
9708 int64_t &CmpValue) const {
9709 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
9710 return false;
9711
9712 switch (MI.getOpcode()) {
9713 default:
9714 break;
9715 case AMDGPU::S_CMP_EQ_U32:
9716 case AMDGPU::S_CMP_EQ_I32:
9717 case AMDGPU::S_CMP_LG_U32:
9718 case AMDGPU::S_CMP_LG_I32:
9719 case AMDGPU::S_CMP_LT_U32:
9720 case AMDGPU::S_CMP_LT_I32:
9721 case AMDGPU::S_CMP_GT_U32:
9722 case AMDGPU::S_CMP_GT_I32:
9723 case AMDGPU::S_CMP_LE_U32:
9724 case AMDGPU::S_CMP_LE_I32:
9725 case AMDGPU::S_CMP_GE_U32:
9726 case AMDGPU::S_CMP_GE_I32:
9727 case AMDGPU::S_CMP_EQ_U64:
9728 case AMDGPU::S_CMP_LG_U64:
9729 SrcReg = MI.getOperand(0).getReg();
9730 if (MI.getOperand(1).isReg()) {
9731 if (MI.getOperand(1).getSubReg())
9732 return false;
9733 SrcReg2 = MI.getOperand(1).getReg();
9734 CmpValue = 0;
9735 } else if (MI.getOperand(1).isImm()) {
9736 SrcReg2 = Register();
9737 CmpValue = MI.getOperand(1).getImm();
9738 } else {
9739 return false;
9740 }
9741 CmpMask = ~0;
9742 return true;
9743 case AMDGPU::S_CMPK_EQ_U32:
9744 case AMDGPU::S_CMPK_EQ_I32:
9745 case AMDGPU::S_CMPK_LG_U32:
9746 case AMDGPU::S_CMPK_LG_I32:
9747 case AMDGPU::S_CMPK_LT_U32:
9748 case AMDGPU::S_CMPK_LT_I32:
9749 case AMDGPU::S_CMPK_GT_U32:
9750 case AMDGPU::S_CMPK_GT_I32:
9751 case AMDGPU::S_CMPK_LE_U32:
9752 case AMDGPU::S_CMPK_LE_I32:
9753 case AMDGPU::S_CMPK_GE_U32:
9754 case AMDGPU::S_CMPK_GE_I32:
9755 SrcReg = MI.getOperand(0).getReg();
9756 SrcReg2 = Register();
9757 CmpValue = MI.getOperand(1).getImm();
9758 CmpMask = ~0;
9759 return true;
9760 }
9761
9762 return false;
9763}
9764
9766 Register SrcReg2, int64_t CmpMask,
9767 int64_t CmpValue,
9768 const MachineRegisterInfo *MRI) const {
9769 if (!SrcReg || SrcReg.isPhysical())
9770 return false;
9771
9772 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
9773 return false;
9774
9775 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
9776 this](int64_t ExpectedValue, unsigned SrcSize,
9777 bool IsReversible, bool IsSigned) -> bool {
9778 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9779 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9780 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9781 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9782 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
9783 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9784 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9785 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9786 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9787 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
9788 //
9789 // Signed ge/gt are not used for the sign bit.
9790 //
9791 // If result of the AND is unused except in the compare:
9792 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
9793 //
9794 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9795 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9796 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
9797 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9798 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9799 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
9800
9801 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
9802 if (!Def || Def->getParent() != CmpInstr.getParent())
9803 return false;
9804
9805 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
9806 Def->getOpcode() != AMDGPU::S_AND_B64)
9807 return false;
9808
9809 int64_t Mask;
9810 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
9811 if (MO->isImm())
9812 Mask = MO->getImm();
9813 else if (!getFoldableImm(MO, Mask))
9814 return false;
9815 Mask &= maxUIntN(SrcSize);
9816 return isPowerOf2_64(Mask);
9817 };
9818
9819 MachineOperand *SrcOp = &Def->getOperand(1);
9820 if (isMask(SrcOp))
9821 SrcOp = &Def->getOperand(2);
9822 else if (isMask(&Def->getOperand(2)))
9823 SrcOp = &Def->getOperand(1);
9824 else
9825 return false;
9826
9827 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
9828 if (IsSigned && BitNo == SrcSize - 1)
9829 return false;
9830
9831 ExpectedValue <<= BitNo;
9832
9833 bool IsReversedCC = false;
9834 if (CmpValue != ExpectedValue) {
9835 if (!IsReversible)
9836 return false;
9837 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
9838 if (!IsReversedCC)
9839 return false;
9840 }
9841
9842 Register DefReg = Def->getOperand(0).getReg();
9843 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
9844 return false;
9845
9846 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
9847 I != E; ++I) {
9848 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
9849 I->killsRegister(AMDGPU::SCC, &RI))
9850 return false;
9851 }
9852
9853 MachineOperand *SccDef =
9854 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
9855 SccDef->setIsDead(false);
9856 CmpInstr.eraseFromParent();
9857
9858 if (!MRI->use_nodbg_empty(DefReg)) {
9859 assert(!IsReversedCC);
9860 return true;
9861 }
9862
9863 // Replace AND with unused result with a S_BITCMP.
9864 MachineBasicBlock *MBB = Def->getParent();
9865
9866 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
9867 : AMDGPU::S_BITCMP1_B32
9868 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
9869 : AMDGPU::S_BITCMP1_B64;
9870
9871 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
9872 .add(*SrcOp)
9873 .addImm(BitNo);
9874 Def->eraseFromParent();
9875
9876 return true;
9877 };
9878
9879 switch (CmpInstr.getOpcode()) {
9880 default:
9881 break;
9882 case AMDGPU::S_CMP_EQ_U32:
9883 case AMDGPU::S_CMP_EQ_I32:
9884 case AMDGPU::S_CMPK_EQ_U32:
9885 case AMDGPU::S_CMPK_EQ_I32:
9886 return optimizeCmpAnd(1, 32, true, false);
9887 case AMDGPU::S_CMP_GE_U32:
9888 case AMDGPU::S_CMPK_GE_U32:
9889 return optimizeCmpAnd(1, 32, false, false);
9890 case AMDGPU::S_CMP_GE_I32:
9891 case AMDGPU::S_CMPK_GE_I32:
9892 return optimizeCmpAnd(1, 32, false, true);
9893 case AMDGPU::S_CMP_EQ_U64:
9894 return optimizeCmpAnd(1, 64, true, false);
9895 case AMDGPU::S_CMP_LG_U32:
9896 case AMDGPU::S_CMP_LG_I32:
9897 case AMDGPU::S_CMPK_LG_U32:
9898 case AMDGPU::S_CMPK_LG_I32:
9899 return optimizeCmpAnd(0, 32, true, false);
9900 case AMDGPU::S_CMP_GT_U32:
9901 case AMDGPU::S_CMPK_GT_U32:
9902 return optimizeCmpAnd(0, 32, false, false);
9903 case AMDGPU::S_CMP_GT_I32:
9904 case AMDGPU::S_CMPK_GT_I32:
9905 return optimizeCmpAnd(0, 32, false, true);
9906 case AMDGPU::S_CMP_LG_U64:
9907 return optimizeCmpAnd(0, 64, true, false);
9908 }
9909
9910 return false;
9911}
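One concrete instance of the bookkeeping inside optimizeCmpAnd, shown as a standalone sketch: the pattern s_cmp_lg_u32 (s_and_b32 $src, 1 << 3), 1 << 3 is recognized as the reversed form and, if the AND result is otherwise unused, can become s_bitcmp0_b32 $src, 3.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Mask = uint64_t(1) << 3; // power-of-two AND mask
  const unsigned BitNo = 3;               // countr_zero(Mask)

  // s_cmp_lg_* enters optimizeCmpAnd with ExpectedValue = 0, which is then
  // shifted by the bit number (still 0 here).
  int64_t ExpectedValue = 0;
  ExpectedValue <<= BitNo;

  // The compare tests against 1 << 3 instead, so the condition is reversed:
  // on a value that is either 0 or 8, "lg 8" behaves like "eq 0".
  const int64_t CmpValue = 8;
  const bool IsReversedCC = CmpValue == (ExpectedValue ^ int64_t(Mask));
  assert(IsReversedCC);

  // With the AND result unused, the reversed form selects S_BITCMP0_B32,
  // i.e. SCC is set when bit 3 of $src is clear.
  return 0;
}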
9912
9914 unsigned OpName) const {
9915 if (!ST.needsAlignedVGPRs())
9916 return;
9917
9918 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
9919 if (OpNo < 0)
9920 return;
9921 MachineOperand &Op = MI.getOperand(OpNo);
9922 if (getOpSize(MI, OpNo) > 4)
9923 return;
9924
9925 // Add implicit aligned super-reg to force alignment on the data operand.
9926 const DebugLoc &DL = MI.getDebugLoc();
9927 MachineBasicBlock *BB = MI.getParent();
9929 Register DataReg = Op.getReg();
9930 bool IsAGPR = RI.isAGPR(MRI, DataReg);
9931 Register Undef = MRI.createVirtualRegister(
9932 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
9933 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
9934 Register NewVR =
9935 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
9936 : &AMDGPU::VReg_64_Align2RegClass);
9937 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
9938 .addReg(DataReg, 0, Op.getSubReg())
9939 .addImm(AMDGPU::sub0)
9940 .addReg(Undef)
9941 .addImm(AMDGPU::sub1);
9942 Op.setReg(NewVR);
9943 Op.setSubReg(AMDGPU::sub0);
9944 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
9945}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
amdgpu AMDGPU Register Bank Select
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
uint64_t High
#define P(N)
TargetInstrInfo::RegSubRegPair RegSubRegPair
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillRestoreOpcode(unsigned Size)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MachineRegisterInfo &MRI, const MCInstrDesc &TID, unsigned RCID, bool IsAllocatable)
static unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillSaveOpcode(unsigned Size)
static bool resultDependsOnExec(const MachineInstr &MI)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
Definition: SIInstrInfo.cpp:85
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static constexpr unsigned ModifierOpNames[]
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
Definition: SIInstrInfo.cpp:76
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool IsDead
static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool has16BitInsts() const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
Class for arbitrary precision integers.
Definition: APInt.h:76
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:264
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
bool useVGPRIndexMode() const
bool hasSDWAOmod() const
Definition: GCNSubtarget.h:735
bool hasA16() const
bool hasSDWAScalar() const
Definition: GCNSubtarget.h:739
bool hasScalarCompareEq64() const
Definition: GCNSubtarget.h:950
bool hasOnlyRevVALUShifts() const
Definition: GCNSubtarget.h:378
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:614
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:755
bool hasMAIInsts() const
Definition: GCNSubtarget.h:805
bool hasMFMAInlineLiteralBug() const
bool hasNegativeScratchOffsetBug() const
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasR128A16() const
bool hasOffset3fBug() const
bool hasGetPCZeroExtension() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:265
const AMDGPURegisterBankInfo * getRegBankInfo() const override
Definition: GCNSubtarget.h:285
bool hasSDWAOutModsVOPC() const
Definition: GCNSubtarget.h:751
bool hasRestrictedSOffset() const
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:670
bool hasGFX940Insts() const
bool hasSDWASdst() const
Definition: GCNSubtarget.h:743
bool hasMovB64() const
bool isWave32() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:331
bool hasNegativeUnalignedScratchOffsetBug() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
Generation getGeneration() const
Definition: GCNSubtarget.h:304
bool hasVOP3Literal() const
Definition: GCNSubtarget.h:875
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:722
bool hasAddr64() const
Definition: GCNSubtarget.h:368
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:714
bool hasGDS() const
bool hasPartialNSAEncoding() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
A possibly irreducible generalization of a Loop.
void getExitBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all of the successor blocks of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:157
iterator find(SlotIndex Pos)
find - Return an iterator pointing to the first segment that ends after Pos, or end().
void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:541
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:611
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:621
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:194
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:237
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition: MCInstrDesc.h:444
bool mayLoad() const
Return true if this instruction could possibly read memory.
Definition: MCInstrDesc.h:438
unsigned short Opcode
Definition: MCInstrDesc.h:205
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
Definition: MCInstrDesc.h:565
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
uint8_t OperandType
Information about the type of the operand.
Definition: MCInstrDesc.h:97
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:91
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:397
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:40
void setVariableValue(const MCExpr *Value)
Definition: MCSymbol.cpp:47
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
unsigned pred_size() const
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into two pieces at SplitInst.
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< iterator > terminators()
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
@ LQR_Dead
Register is known to be fully dead.
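A minimal sketch of the liveness query above, assuming a caller that wants to clobber a physical register (for example SCC) at an insertion point:

#include "llvm/CodeGen/MachineBasicBlock.h"

// True when Reg is known fully dead just before Before, i.e. the query above
// returned LQR_Dead with the default Neighborhood.
static bool isDeadBefore(llvm::MachineBasicBlock &MBB,
                         llvm::MachineBasicBlock::iterator Before,
                         const llvm::TargetRegisterInfo *TRI,
                         llvm::MCRegister Reg) {
  return MBB.computeRegisterLiveness(TRI, Reg, Before) ==
         llvm::MachineBasicBlock::LQR_Dead;
}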
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineDomTreeNode * addNewBlock(MachineBasicBlock *BB, MachineBasicBlock *DomBB)
addNewBlock - Add a new node to the dominator tree information.
bool properlyDominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
void changeImmediateDominator(MachineBasicBlock *N, MachineBasicBlock *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine with which this machine code is compiled.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
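Taken together, the MachineInstrBuilder helpers above are normally chained off BuildMI. A minimal sketch, assuming it runs inside an AMDGPU pass where TII is the subtarget's SIInstrInfo and the AMDGPU opcode enum is visible; the opcode and immediate are illustrative only:

#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

// Each addImm/addReg/addUse call appends one MachineOperand and returns the
// builder, so operands are added in the order the MCInstrDesc expects.
static void emitMovImm(llvm::MachineBasicBlock &MBB,
                       llvm::MachineBasicBlock::iterator I,
                       const llvm::DebugLoc &DL, const llvm::SIInstrInfo *TII,
                       llvm::Register DstReg) {
  llvm::BuildMI(MBB, I, DL, TII->get(llvm::AMDGPU::V_MOV_B32_e32), DstReg)
      .addImm(42); // single immediate source operand
}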
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:558
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:341
void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
unsigned getNumOperands() const
Returns the total number of operands.
Definition: MachineInstr.h:561
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
iterator_range< mop_iterator > explicit_operands()
Definition: MachineInstr.h:680
unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:804
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:789
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:771
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:487
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
iterator_range< mop_iterator > implicit_operands()
Definition: MachineInstr.h:688
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:568
uint32_t getFlags() const
Return the MI flags bitvector.
Definition: MachineInstr.h:386
int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
bool addRegisterDead(Register Reg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound=false)
We have determined MI defined a register without a use.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
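As a hedged illustration of how these MachineOperand accessors compose (the folding condition is an assumption for the example, not logic taken from this file):

#include "llvm/CodeGen/MachineOperand.h"

// Rewrite an explicit use of Reg into an immediate once the value is known;
// shows isReg/getReg/isImplicit/ChangeToImmediate working together.
static void foldKnownValue(llvm::MachineOperand &MO, llvm::Register Reg,
                           int64_t KnownImm) {
  if (MO.isReg() && !MO.isImplicit() && MO.getReg() == Reg)
    MO.ChangeToImmediate(KnownImm);
}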
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:792
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:554
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
bool isNonUniformBranchInstr(MachineInstr &Instr) const
static bool isVOP3(const MachineInstr &MI)
Definition: SIInstrInfo.h:504
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
Definition: SIInstrInfo.h:924
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
Definition: SIInstrInfo.h:1105
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:636
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:544
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const
Get required immediate operand.
Definition: SIInstrInfo.h:1233
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:536
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:649
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:408
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:496
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:512
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:604
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIdx operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:520
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
bool isSpill(uint16_t Opcode) const
Definition: SIInstrInfo.h:740
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:588
static bool isSOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:448
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:570
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named OperandName.
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:628
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:596
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
Definition: SIInstrInfo.h:424
static bool isSOPK(const MachineInstr &MI)
Definition: SIInstrInfo.h:464
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:947
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:528
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:618
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:760
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
void materializeImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, int64_t Value) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specified machine instruction is an instruction that moves/copies a value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by the assembler.
static bool isVGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:716
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool isVGPRCopy(const MachineInstr &MI) const
Definition: SIInstrInfo.h:970
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:580
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description.
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
void convertNonUniformIfRegion(MachineBasicBlock *IfEntry, MachineBasicBlock *IfEnd) const
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
Whether we must prevent this instruction from executing with EXEC = 0.
static bool isAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:681
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
static bool sopkIsZext(unsigned Opcode)
Definition: SIInstrInfo.h:863
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
static bool isSGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:728
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:809
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:456
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:612
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:416
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
Definition: SIInstrInfo.h:1246
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
static bool isFixedSize(const MachineInstr &MI)
Definition: SIInstrInfo.h:880
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
static bool isLDSDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:562
unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isVOP1(const MachineInstr &MI)
Definition: SIInstrInfo.h:488
SIInstrInfo(const GCNSubtarget &ST)
Definition: SIInstrInfo.cpp:66
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
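A short sketch of how the static SIInstrInfo predicates above are typically used from a pass; the counting policy here is an illustrative assumption:

#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"

// Count VALU instructions in a block, ignoring SGPR spill pseudos.
static unsigned countVALU(const llvm::MachineBasicBlock &MBB) {
  unsigned NumVALU = 0;
  for (const llvm::MachineInstr &MI : MBB) {
    if (llvm::SIInstrInfo::isSGPRSpill(MI))
      continue;
    if (llvm::SIInstrInfo::isVALU(MI))
      ++NumVALU;
  }
  return NumVALU;
}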
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
static bool hasVectorRegisters(const TargetRegisterClass *RC)
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool opCanUseInlineConstant(unsigned OpType) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
bool opCanUseLiteralConstant(unsigned OpType) const
static bool hasVGPRs(const TargetRegisterClass *RC)
static bool isVGPRClass(const TargetRegisterClass *RC)
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
const TargetRegisterClass * getBoolRC() const
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getChannelFromSubReg(unsigned SubReg) const
MCRegister getVCC() const
static bool hasAGPRs(const TargetRegisterClass *RC)
const TargetRegisterClass * getWaveMaskRegClass() const
bool spillSGPRToVGPR() const
const TargetRegisterClass * getVGPR64Class() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
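A hedged sketch combining the SIRegisterInfo queries above: pick the VGPR counterpart of a register class when an operand must be moved off the scalar unit (RI is assumed to be the subtarget's SIRegisterInfo):

#include "SIRegisterInfo.h"

static const llvm::TargetRegisterClass *
vgprClassFor(const llvm::SIRegisterInfo &RI,
             const llvm::TargetRegisterClass *RC) {
  // SGPR classes get their equivalent VGPR class; everything else is kept.
  if (llvm::SIRegisterInfo::isSGPRClass(RC))
    return RI.getEquivalentVGPRClass(RC);
  return RC;
}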
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:559
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:68
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:240
SlotIndexes pass.
Definition: SlotIndexes.h:300
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:523
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
const TargetRegisterClass * getAllocatableClass(const TargetRegisterClass *RC) const
Return the maximal subclass of the given register class that is allocatable or NULL.
unsigned getSubRegIdxSize(unsigned Idx) const
Get the size of the bit range covered by a sub-register index.
unsigned getSubRegIdxOffset(unsigned Idx) const
Get the offset of the bit range covered by a sub-register index.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
self_iterator getIterator()
Definition: ilist_node.h:109
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
Definition: SIInstrInfo.h:1524
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isDPALU_DPP(const MCInstrDesc &OpDesc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
Definition: SIInstrInfo.h:1525
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
bool isTrue16Inst(unsigned Opc)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo)
Is this an AMDGPU specific source operand? These include registers, inline constants,...
const uint64_t RSRC_TID_ENABLE
Definition: SIInstrInfo.h:1527
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isGenericAtomic(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the inline values reserved for floating-point constants.
bool isHi(unsigned Reg, const MCRegisterInfo &MRI)
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition: SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition: SIDefines.h:201
@ OPERAND_REG_IMM_V2FP16
Definition: SIDefines.h:211
@ OPERAND_REG_INLINE_C_V2INT32
Definition: SIDefines.h:227
@ OPERAND_REG_INLINE_C_FP64
Definition: SIDefines.h:223
@ OPERAND_REG_INLINE_C_BF16
Definition: SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2BF16
Definition: SIDefines.h:225
@ OPERAND_REG_IMM_V2INT16
Definition: SIDefines.h:212
@ OPERAND_REG_IMM_BF16
Definition: SIDefines.h:205
@ OPERAND_REG_INLINE_AC_V2FP16
Definition: SIDefines.h:246
@ OPERAND_REG_IMM_INT32
Operands with register or 32-bit immediate.
Definition: SIDefines.h:200
@ OPERAND_REG_IMM_V2BF16
Definition: SIDefines.h:210
@ OPERAND_REG_IMM_BF16_DEFERRED
Definition: SIDefines.h:207
@ OPERAND_REG_IMM_FP16
Definition: SIDefines.h:206
@ OPERAND_REG_INLINE_C_INT64
Definition: SIDefines.h:219
@ OPERAND_REG_INLINE_AC_BF16
Definition: SIDefines.h:240
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition: SIDefines.h:217
@ OPERAND_REG_INLINE_AC_INT16
Operands with an AccVGPR register or inline constant.
Definition: SIDefines.h:238
@ OPERAND_REG_IMM_FP64
Definition: SIDefines.h:204
@ OPERAND_REG_INLINE_C_V2FP16
Definition: SIDefines.h:226
@ OPERAND_REG_INLINE_AC_V2INT16
Definition: SIDefines.h:244
@ OPERAND_REG_INLINE_AC_FP16
Definition: SIDefines.h:241
@ OPERAND_REG_INLINE_AC_INT32
Definition: SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition: SIDefines.h:242
@ OPERAND_REG_INLINE_AC_V2BF16
Definition: SIDefines.h:245
@ OPERAND_REG_IMM_V2INT32
Definition: SIDefines.h:213
@ OPERAND_REG_IMM_FP32
Definition: SIDefines.h:203
@ OPERAND_INPUT_MODS
Definition: SIDefines.h:251
@ OPERAND_REG_INLINE_C_FP32
Definition: SIDefines.h:222
@ OPERAND_REG_INLINE_C_INT32
Definition: SIDefines.h:218
@ OPERAND_REG_INLINE_C_V2INT16
Definition: SIDefines.h:224
@ OPERAND_REG_IMM_V2FP32
Definition: SIDefines.h:214
@ OPERAND_REG_INLINE_AC_FP64
Definition: SIDefines.h:243
@ OPERAND_REG_INLINE_C_FP16
Definition: SIDefines.h:221
@ OPERAND_REG_IMM_INT16
Definition: SIDefines.h:202
@ OPERAND_REG_INLINE_C_V2FP32
Definition: SIDefines.h:228
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition: SIDefines.h:231
@ OPERAND_REG_IMM_FP32_DEFERRED
Definition: SIDefines.h:209
@ OPERAND_REG_IMM_FP16_DEFERRED
Definition: SIDefines.h:208
@ TI_SCRATCH_RSRC_DWORD1
Definition: AMDGPU.h:409
@ TI_SCRATCH_RSRC_DWORD3
Definition: AMDGPU.h:411
@ TI_SCRATCH_RSRC_DWORD0
Definition: AMDGPU.h:408
@ TI_SCRATCH_RSRC_DWORD2
Definition: AMDGPU.h:410
@ TI_CONSTDATA_START
Definition: AMDGPU.h:407
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
int getMCOpcode(uint16_t Opcode, unsigned Gen)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
Definition: SIInstrInfo.h:1526
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
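A minimal sketch of the inline-literal helpers above; the include path and the way HasInv2Pi is obtained are assumptions of the example:

#include "Utils/AMDGPUBaseInfo.h"

// True when a 32-bit value can be encoded as an inline constant instead of a
// literal; HasInv2Pi is a subtarget property queried by the caller.
static bool fitsInlineConstant(int32_t Value, bool HasInv2Pi) {
  return llvm::AMDGPU::isInlinableLiteral32(Value, HasInv2Pi) ||
         llvm::AMDGPU::isInlinableIntLiteral(Value);
}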
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
bool isGraphics(CallingConv::ID cc)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ OPERAND_GENERIC_4
Definition: MCInstrDesc.h:70
@ OPERAND_GENERIC_2
Definition: MCInstrDesc.h:68
@ OPERAND_GENERIC_1
Definition: MCInstrDesc.h:67
@ OPERAND_REGISTER
Definition: MCInstrDesc.h:61
@ OPERAND_GENERIC_3
Definition: MCInstrDesc.h:69
@ OPERAND_IMMEDIATE
Definition: MCInstrDesc.h:60
@ OPERAND_UNKNOWN
Definition: MCInstrDesc.h:59
@ OPERAND_GENERIC_0
Definition: MCInstrDesc.h:66
@ OPERAND_GENERIC_5
Definition: MCInstrDesc.h:71
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:456
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition: SIInstrInfo.h:1415
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:280
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
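A small sketch of the Lo_32/Hi_32 helpers above, as used when a 64-bit immediate is materialized as two 32-bit halves:

#include "llvm/Support/MathExtras.h"
#include <cstdint>
#include <utility>

// Split a 64-bit immediate into its low and high 32-bit halves.
static std::pair<uint32_t, uint32_t> splitImm64(uint64_t Imm) {
  return {llvm::Lo_32(Imm), llvm::Hi_32(Imm)};
}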
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:244
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
Definition: TargetOpcodes.h:36
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Error
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is congruent to Skew modulo Align.
Definition: MathExtras.h:439
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition: Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition: MathExtras.h:203
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition: APFloat.cpp:216
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Description of the encoding of one expression Op.
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks through which this value is completely alive.
Definition: LiveVariables.h:85
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition: SIInstrInfo.h:49
MachineInstr * top() const
Definition: SIInstrInfo.h:54
bool empty() const
Definition: SIInstrInfo.h:64
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition: SIInstrInfo.h:73
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.