SIInstrInfo.cpp
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
31#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include "llvm/MC/MCContext.h"
35
36using namespace llvm;
37
38#define DEBUG_TYPE "si-instr-info"
39
40#define GET_INSTRINFO_CTOR_DTOR
41#include "AMDGPUGenInstrInfo.inc"
42
43namespace llvm {
44namespace AMDGPU {
45#define GET_D16ImageDimIntrinsics_IMPL
46#define GET_ImageDimIntrinsicTable_IMPL
47#define GET_RsrcIntrinsics_IMPL
48#include "AMDGPUGenSearchableTables.inc"
49}
50}
51
52
53// Must be at least 4 to be able to branch over minimum unconditional branch
54// code. This is only for making it possible to write reasonably small tests for
55// long branches.
56static cl::opt<unsigned>
57BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
58 cl::desc("Restrict range of branch instructions (DEBUG)"));
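// Illustrative usage (an assumed example, not taken from a specific LLVM
// test): a long-branch test can shrink the range so relaxation triggers on a
// tiny kernel, e.g. by passing "-amdgpu-s-branch-bits=5" to llc.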
59
61 "amdgpu-fix-16-bit-physreg-copies",
62 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
63 cl::init(true),
64 cl::ReallyHidden);
65
66SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
67 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
68 RI(ST), ST(ST) {
69 SchedModel.init(&ST);
70}
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
83/// Returns true if both nodes have the same value for the given
84/// operand \p Op, or if both nodes do not have this operand.
85static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
86 unsigned Opc0 = N0->getMachineOpcode();
87 unsigned Opc1 = N1->getMachineOpcode();
88
89 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
90 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
91
92 if (Op0Idx == -1 && Op1Idx == -1)
93 return true;
94
95
96 if ((Op0Idx == -1 && Op1Idx != -1) ||
97 (Op1Idx == -1 && Op0Idx != -1))
98 return false;
99
100 // getNamedOperandIdx returns the index for the MachineInstr's operands,
101 // which includes the result as the first operand. We are indexing into the
102 // MachineSDNode's operands, so we need to skip the result operand to get
103 // the real index.
104 --Op0Idx;
105 --Op1Idx;
106
107 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
108}
109
110static bool canRemat(const MachineInstr &MI) {
111
112 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
113 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
114 SIInstrInfo::isSALU(MI))
115 return true;
116
117 if (SIInstrInfo::isSMRD(MI)) {
118 return !MI.memoperands_empty() &&
119 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
120 return MMO->isLoad() && MMO->isInvariant();
121 });
122 }
123
124 return false;
125}
126
127bool SIInstrInfo::isReallyTriviallyReMaterializable(
128 const MachineInstr &MI) const {
129
130 if (canRemat(MI)) {
131 // Normally a VALU use of exec would block rematerialization, but an
132 // implicit exec read is OK in this case, as all VALU instructions have one.
133 // We really want all of the generic logic for this except for this check.
134
135 // Another potential implicit use is mode register. The core logic of
136 // the RA will not attempt rematerialization if mode is set anywhere
137 // in the function, otherwise it is safe since mode is not changed.
138
139 // This differs from the generic method, which does not allow
140 // rematerialization if there are virtual register uses. We allow this,
141 // which is why this method covers SOP instructions as well.
142 if (!MI.hasImplicitDef() &&
143 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
144 !MI.mayRaiseFPException())
145 return true;
146 }
147
148 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
149}
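// Illustrative example (assumed for exposition, not from the original
// source): a plain "V_MOV_B32_e32 %0, 42, implicit $exec" passes the check
// above, since its only implicit operand is the exec use listed in its
// instruction descriptor, so it can be rematerialized; the same instruction
// with an extra implicit operand attached by a pass would have more implicit
// operands than the descriptor lists and would be rejected.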
150
151// Returns true if the scalar result of a VALU instruction depends on exec.
152static bool resultDependsOnExec(const MachineInstr &MI) {
153 // Ignore comparisons which are only used masked with exec.
154 // This allows some hoisting/sinking of VALU comparisons.
155 if (MI.isCompare()) {
156 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
157 Register DstReg = MI.getOperand(0).getReg();
158 if (!DstReg.isVirtual())
159 return true;
160 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
161 switch (Use.getOpcode()) {
162 case AMDGPU::S_AND_SAVEEXEC_B32:
163 case AMDGPU::S_AND_SAVEEXEC_B64:
164 break;
165 case AMDGPU::S_AND_B32:
166 case AMDGPU::S_AND_B64:
167 if (!Use.readsRegister(AMDGPU::EXEC))
168 return true;
169 break;
170 default:
171 return true;
172 }
173 }
174 return false;
175 }
176
177 switch (MI.getOpcode()) {
178 default:
179 break;
180 case AMDGPU::V_READFIRSTLANE_B32:
181 return true;
182 }
183
184 return false;
185}
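// Illustrative example (assumed for exposition): a V_CMP compare whose result
// is only consumed by S_AND_B32/B64 with exec or by S_AND_SAVEEXEC is treated
// as not depending on exec, so it may be hoisted or sunk, while
// V_READFIRSTLANE_B32 always reports true because its scalar result changes
// with the set of active lanes.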
186
187bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
188 // Any implicit use of exec by VALU is not a real register read.
189 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
190 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
191}
192
193bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
194 MachineBasicBlock *SuccToSinkTo,
195 MachineCycleInfo *CI) const {
196 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
197 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
198 return true;
199
200 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
201 // Check if sinking of MI would create temporal divergent use.
202 for (auto Op : MI.uses()) {
203 if (Op.isReg() && Op.getReg().isVirtual() &&
204 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
205 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
206
207 // SgprDef defined inside cycle
208 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
209 if (FromCycle == nullptr)
210 continue;
211
212 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
213 // Check if there is a FromCycle that contains SgprDef's basic block but
214 // does not contain SuccToSinkTo and also has divergent exit condition.
215 while (FromCycle && !FromCycle->contains(ToCycle)) {
216 // After structurize-cfg, there should be exactly one cycle exit.
217 SmallVector<MachineBasicBlock *, 1> ExitBlocks;
218 FromCycle->getExitBlocks(ExitBlocks);
219 assert(ExitBlocks.size() == 1);
220 assert(ExitBlocks[0]->getSinglePredecessor());
221
222 // FromCycle has divergent exit condition.
223 if (hasDivergentBranch(ExitBlocks[0]->getSinglePredecessor())) {
224 return false;
225 }
226
227 FromCycle = FromCycle->getParentCycle();
228 }
229 }
230 }
231
232 return true;
233}
234
235bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
236 int64_t &Offset0,
237 int64_t &Offset1) const {
238 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
239 return false;
240
241 unsigned Opc0 = Load0->getMachineOpcode();
242 unsigned Opc1 = Load1->getMachineOpcode();
243
244 // Make sure both are actually loads.
245 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
246 return false;
247
248 // A mayLoad instruction without a def is not a load. Likely a prefetch.
249 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
250 return false;
251
252 if (isDS(Opc0) && isDS(Opc1)) {
253
254 // FIXME: Handle this case:
255 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
256 return false;
257
258 // Check base reg.
259 if (Load0->getOperand(0) != Load1->getOperand(0))
260 return false;
261
262 // Skip read2 / write2 variants for simplicity.
263 // TODO: We should report true if the used offsets are adjacent (excluded
264 // st64 versions).
265 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
266 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
267 if (Offset0Idx == -1 || Offset1Idx == -1)
268 return false;
269
270 // XXX - be careful of dataless loads
271 // getNamedOperandIdx returns the index for MachineInstrs. Since they
272 // include the output in the operand list, but SDNodes don't, we need to
273 // subtract the index by one.
274 Offset0Idx -= get(Opc0).NumDefs;
275 Offset1Idx -= get(Opc1).NumDefs;
276 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
277 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
278 return true;
279 }
280
281 if (isSMRD(Opc0) && isSMRD(Opc1)) {
282 // Skip time and cache invalidation instructions.
283 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
284 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
285 return false;
286
287 unsigned NumOps = getNumOperandsNoGlue(Load0);
288 if (NumOps != getNumOperandsNoGlue(Load1))
289 return false;
290
291 // Check base reg.
292 if (Load0->getOperand(0) != Load1->getOperand(0))
293 return false;
294
295 // Match register offsets, if both register and immediate offsets present.
296 assert(NumOps == 4 || NumOps == 5);
297 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
298 return false;
299
300 const ConstantSDNode *Load0Offset =
301 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
302 const ConstantSDNode *Load1Offset =
303 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
304
305 if (!Load0Offset || !Load1Offset)
306 return false;
307
308 Offset0 = Load0Offset->getZExtValue();
309 Offset1 = Load1Offset->getZExtValue();
310 return true;
311 }
312
313 // MUBUF and MTBUF can access the same addresses.
314 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
315
316 // MUBUF and MTBUF have vaddr at different indices.
317 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
318 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
319 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
320 return false;
321
322 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
323 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
324
325 if (OffIdx0 == -1 || OffIdx1 == -1)
326 return false;
327
328 // getNamedOperandIdx returns the index for MachineInstrs. Since they
329 // include the output in the operand list, but SDNodes don't, we need to
330 // subtract the index by one.
331 OffIdx0 -= get(Opc0).NumDefs;
332 OffIdx1 -= get(Opc1).NumDefs;
333
334 SDValue Off0 = Load0->getOperand(OffIdx0);
335 SDValue Off1 = Load1->getOperand(OffIdx1);
336
337 // The offset might be a FrameIndexSDNode.
338 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
339 return false;
340
341 Offset0 = Off0->getAsZExtVal();
342 Offset1 = Off1->getAsZExtVal();
343 return true;
344 }
345
346 return false;
347}
348
349static bool isStride64(unsigned Opc) {
350 switch (Opc) {
351 case AMDGPU::DS_READ2ST64_B32:
352 case AMDGPU::DS_READ2ST64_B64:
353 case AMDGPU::DS_WRITE2ST64_B32:
354 case AMDGPU::DS_WRITE2ST64_B64:
355 return true;
356 default:
357 return false;
358 }
359}
360
361bool SIInstrInfo::getMemOperandsWithOffsetWidth(
362 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
363 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
364 const TargetRegisterInfo *TRI) const {
365 if (!LdSt.mayLoadOrStore())
366 return false;
367
368 unsigned Opc = LdSt.getOpcode();
369 OffsetIsScalable = false;
370 const MachineOperand *BaseOp, *OffsetOp;
371 int DataOpIdx;
372
373 if (isDS(LdSt)) {
374 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
375 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
376 if (OffsetOp) {
377 // Normal, single offset LDS instruction.
378 if (!BaseOp) {
379 // DS_CONSUME/DS_APPEND use M0 for the base address.
380 // TODO: find the implicit use operand for M0 and use that as BaseOp?
381 return false;
382 }
383 BaseOps.push_back(BaseOp);
384 Offset = OffsetOp->getImm();
385 // Get appropriate operand, and compute width accordingly.
386 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
387 if (DataOpIdx == -1)
388 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
389 Width = getOpSize(LdSt, DataOpIdx);
390 } else {
391 // The 2 offset instructions use offset0 and offset1 instead. We can treat
392 // these as a load with a single offset if the 2 offsets are consecutive.
393 // We will use this for some partially aligned loads.
394 const MachineOperand *Offset0Op =
395 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
396 const MachineOperand *Offset1Op =
397 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
398
399 unsigned Offset0 = Offset0Op->getImm() & 0xff;
400 unsigned Offset1 = Offset1Op->getImm() & 0xff;
401 if (Offset0 + 1 != Offset1)
402 return false;
403
404 // Each of these offsets is in element sized units, so we need to convert
405 // to bytes of the individual reads.
406
407 unsigned EltSize;
408 if (LdSt.mayLoad())
409 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
410 else {
411 assert(LdSt.mayStore());
412 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
413 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
414 }
415
416 if (isStride64(Opc))
417 EltSize *= 64;
418
419 BaseOps.push_back(BaseOp);
420 Offset = EltSize * Offset0;
421 // Get appropriate operand(s), and compute width accordingly.
422 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
423 if (DataOpIdx == -1) {
424 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
425 Width = getOpSize(LdSt, DataOpIdx);
426 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
427 Width = Width.getValue() + getOpSize(LdSt, DataOpIdx);
428 } else {
429 Width = getOpSize(LdSt, DataOpIdx);
430 }
431 }
432 return true;
433 }
434
435 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
436 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
437 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
438 return false;
439 BaseOps.push_back(RSrc);
440 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
441 if (BaseOp && !BaseOp->isFI())
442 BaseOps.push_back(BaseOp);
443 const MachineOperand *OffsetImm =
444 getNamedOperand(LdSt, AMDGPU::OpName::offset);
445 Offset = OffsetImm->getImm();
446 const MachineOperand *SOffset =
447 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
448 if (SOffset) {
449 if (SOffset->isReg())
450 BaseOps.push_back(SOffset);
451 else
452 Offset += SOffset->getImm();
453 }
454 // Get appropriate operand, and compute width accordingly.
455 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
456 if (DataOpIdx == -1)
457 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
458 if (DataOpIdx == -1) // LDS DMA
459 return false;
460 Width = getOpSize(LdSt, DataOpIdx);
461 return true;
462 }
463
464 if (isMIMG(LdSt)) {
465 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
466 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
467 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
468 if (VAddr0Idx >= 0) {
469 // GFX10 possible NSA encoding.
470 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
471 BaseOps.push_back(&LdSt.getOperand(I));
472 } else {
473 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
474 }
475 Offset = 0;
476 // Get appropriate operand, and compute width accordingly.
477 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
478 Width = getOpSize(LdSt, DataOpIdx);
479 return true;
480 }
481
482 if (isSMRD(LdSt)) {
483 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
484 if (!BaseOp) // e.g. S_MEMTIME
485 return false;
486 BaseOps.push_back(BaseOp);
487 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
488 Offset = OffsetOp ? OffsetOp->getImm() : 0;
489 // Get appropriate operand, and compute width accordingly.
490 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
491 if (DataOpIdx == -1)
492 return false;
493 Width = getOpSize(LdSt, DataOpIdx);
494 return true;
495 }
496
497 if (isFLAT(LdSt)) {
498 // Instructions have either vaddr or saddr or both or none.
499 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
500 if (BaseOp)
501 BaseOps.push_back(BaseOp);
502 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
503 if (BaseOp)
504 BaseOps.push_back(BaseOp);
505 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
506 // Get appropriate operand, and compute width accordingly.
507 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
508 if (DataOpIdx == -1)
509 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
510 if (DataOpIdx == -1) // LDS DMA
511 return false;
512 Width = getOpSize(LdSt, DataOpIdx);
513 return true;
514 }
515
516 return false;
517}
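// Worked example for the DS read2/write2 path above (assumed values, for
// exposition only): a DS_READ2_B32 with offset0=4 and offset1=5 loads two
// consecutive 4-byte elements. Its 64-bit vdst gives
//   EltSize = getRegSizeInBits(vdst) / 16 = 64 / 16 = 4 bytes,
// so the reported Offset is 4 * 4 = 16 bytes from the base address and the
// reported Width is the full 8-byte vdst. A DS_READ2ST64_B32 with the same
// offsets would instead use EltSize = 4 * 64 = 256 bytes.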
518
519static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
520 ArrayRef<const MachineOperand *> BaseOps1,
521 const MachineInstr &MI2,
522 ArrayRef<const MachineOperand *> BaseOps2) {
523 // Only examine the first "base" operand of each instruction, on the
524 // assumption that it represents the real base address of the memory access.
525 // Other operands are typically offsets or indices from this base address.
526 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
527 return true;
528
529 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
530 return false;
531
532 auto MO1 = *MI1.memoperands_begin();
533 auto MO2 = *MI2.memoperands_begin();
534 if (MO1->getAddrSpace() != MO2->getAddrSpace())
535 return false;
536
537 auto Base1 = MO1->getValue();
538 auto Base2 = MO2->getValue();
539 if (!Base1 || !Base2)
540 return false;
541 Base1 = getUnderlyingObject(Base1);
542 Base2 = getUnderlyingObject(Base2);
543
544 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
545 return false;
546
547 return Base1 == Base2;
548}
549
550bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
551 int64_t Offset1, bool OffsetIsScalable1,
552 ArrayRef<const MachineOperand *> BaseOps2,
553 int64_t Offset2, bool OffsetIsScalable2,
554 unsigned ClusterSize,
555 unsigned NumBytes) const {
556 // If the mem ops (to be clustered) do not have the same base ptr, then they
557 // should not be clustered
558 if (!BaseOps1.empty() && !BaseOps2.empty()) {
559 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
560 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
561 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
562 return false;
563 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
564 // If only one base op is empty, they do not have the same base ptr
565 return false;
566 }
567
568 // To avoid register pressure, the number of DWORDs loaded together by all
569 // clustered mem ops should not, on average, exceed 8. This is an
570 // empirical value based on certain observations and performance related
571 // experiments.
572 // The good thing about this heuristic is that it avoids clustering too many
573 // sub-word loads and also avoids clustering wide loads. Below is a
574 // brief summary of how the heuristic behaves for various `LoadSize`.
575 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
576 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
577 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
578 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
579 // (5) LoadSize >= 17: do not cluster
580 const unsigned LoadSize = NumBytes / ClusterSize;
581 const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
582 return NumDWORDs <= 8;
583}
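// Worked example of the DWORD cap above (assumed values, for exposition
// only): clustering four 4-byte loads gives NumBytes=16 and ClusterSize=4,
// so LoadSize=4 and NumDWORDs=((4+3)/4)*4=4 <= 8, and clustering is allowed.
// Clustering four 12-byte loads gives LoadSize=12 and
// NumDWORDs=((12+3)/4)*4=12 > 8, so it is rejected, matching case (3) in the
// summary: at most two such mem ops may cluster.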
584
585// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
586// the first 16 loads will be interleaved with the stores, and the next 16 will
587// be clustered as expected. It should really split into two batches of 16 stores.
588//
589// Loads are clustered until this returns false, rather than trying to schedule
590// groups of stores. This also means we have to deal with saying different
591// address space loads should be clustered, and ones which might cause bank
592// conflicts.
593//
594// This might be deprecated so it might not be worth that much effort to fix.
595bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
596 int64_t Offset0, int64_t Offset1,
597 unsigned NumLoads) const {
598 assert(Offset1 > Offset0 &&
599 "Second offset should be larger than first offset!");
600 // If we have less than 16 loads in a row, and the offsets are within 64
601 // bytes, then schedule together.
602
603 // A cacheline is 64 bytes (for global memory).
604 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
605}
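// Illustrative example (assumed values): two loads at offsets 0 and 48 with
// NumLoads=4 are scheduled together because 48 - 0 < 64 (one 64-byte global
// cacheline), while loads at offsets 0 and 128 are not.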
606
607static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
608 MachineBasicBlock::iterator MI,
609 const DebugLoc &DL, MCRegister DestReg,
610 MCRegister SrcReg, bool KillSrc,
611 const char *Msg = "illegal VGPR to SGPR copy") {
612 MachineFunction *MF = MBB.getParent();
613 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
614 LLVMContext &C = MF->getFunction().getContext();
615 C.diagnose(IllegalCopy);
616
617 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
618 .addReg(SrcReg, getKillRegState(KillSrc));
619}
620
621/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
622/// possible to have a direct copy in these cases on GFX908, so an intermediate
623/// VGPR copy is required.
624static void indirectCopyToAGPR(const SIInstrInfo &TII,
625 MachineBasicBlock &MBB,
626 MachineBasicBlock::iterator MI,
627 const DebugLoc &DL, MCRegister DestReg,
628 MCRegister SrcReg, bool KillSrc,
629 RegScavenger &RS, bool RegsOverlap,
630 Register ImpDefSuperReg = Register(),
631 Register ImpUseSuperReg = Register()) {
632 assert((TII.getSubtarget().hasMAIInsts() &&
633 !TII.getSubtarget().hasGFX90AInsts()) &&
634 "Expected GFX908 subtarget.");
635
636 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
637 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
638 "Source register of the copy should be either an SGPR or an AGPR.");
639
640 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
641 "Destination register of the copy should be an AGPR.");
642
643 const SIRegisterInfo &RI = TII.getRegisterInfo();
644
645 // First try to find defining accvgpr_write to avoid temporary registers.
646 // In the case of copies of overlapping AGPRs, we conservatively do not
647 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
648 // an accvgpr_write used for this same copy due to implicit-defs
649 if (!RegsOverlap) {
650 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
651 --Def;
652
653 if (!Def->modifiesRegister(SrcReg, &RI))
654 continue;
655
656 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
657 Def->getOperand(0).getReg() != SrcReg)
658 break;
659
660 MachineOperand &DefOp = Def->getOperand(1);
661 assert(DefOp.isReg() || DefOp.isImm());
662
663 if (DefOp.isReg()) {
664 bool SafeToPropagate = true;
665 // Check that register source operand is not clobbered before MI.
666 // Immediate operands are always safe to propagate.
667 for (auto I = Def; I != MI && SafeToPropagate; ++I)
668 if (I->modifiesRegister(DefOp.getReg(), &RI))
669 SafeToPropagate = false;
670
671 if (!SafeToPropagate)
672 break;
673
674 DefOp.setIsKill(false);
675 }
676
677 MachineInstrBuilder Builder =
678 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
679 .add(DefOp);
680 if (ImpDefSuperReg)
681 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
682
683 if (ImpUseSuperReg) {
684 Builder.addReg(ImpUseSuperReg,
685 getKillRegState(KillSrc) | RegState::Implicit);
686 }
687
688 return;
689 }
690 }
691
692 RS.enterBasicBlockEnd(MBB);
693 RS.backward(std::next(MI));
694
695 // Ideally we want to have three registers for a long reg_sequence copy
696 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
697 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
698 *MBB.getParent());
699
700 // Registers in the sequence are allocated contiguously so we can just
701 // use register number to pick one of three round-robin temps.
702 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
703 Register Tmp =
704 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
706 "VGPR used for an intermediate copy should have been reserved.");
707
708 // Only loop through if there are any free registers left. We don't want to
709 // spill.
710 while (RegNo--) {
711 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
712 /* RestoreAfter */ false, 0,
713 /* AllowSpill */ false);
714 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
715 break;
716 Tmp = Tmp2;
717 RS.setRegUsed(Tmp);
718 }
719
720 // Insert copy to temporary VGPR.
721 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
722 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
723 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
724 } else {
725 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
726 }
727
728 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
729 .addReg(SrcReg, getKillRegState(KillSrc));
730 if (ImpUseSuperReg) {
731 UseBuilder.addReg(ImpUseSuperReg,
732 getKillRegState(KillSrc) | RegState::Implicit);
733 }
734
735 MachineInstrBuilder DefBuilder
736 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
737 .addReg(Tmp, RegState::Kill);
738
739 if (ImpDefSuperReg)
740 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
741}
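// Illustrative expansion on gfx908 (register numbers are assumed examples,
// not taken from the original source): copying $sgpr4 into $agpr2 with no
// reusable accvgpr_write becomes
//   $vgpr32 = V_MOV_B32_e32 $sgpr4            ; reserved VGPR-for-AGPR-copy temp
//   $agpr2  = V_ACCVGPR_WRITE_B32_e64 killed $vgpr32
// while an AGPR-to-AGPR copy uses V_ACCVGPR_READ_B32_e64 for the first step.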
742
743static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
744 MachineBasicBlock::iterator MI, const DebugLoc &DL,
745 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
746 const TargetRegisterClass *RC, bool Forward) {
747 const SIRegisterInfo &RI = TII.getRegisterInfo();
748 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
749 MachineBasicBlock::iterator I = MI;
750 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
751
752 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
753 int16_t SubIdx = BaseIndices[Idx];
754 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
755 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
756 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
757 unsigned Opcode = AMDGPU::S_MOV_B32;
758
759 // Is SGPR aligned? If so try to combine with next.
760 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
761 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
762 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
763 // Can use SGPR64 copy
764 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
765 SubIdx = RI.getSubRegFromChannel(Channel, 2);
766 DestSubReg = RI.getSubReg(DestReg, SubIdx);
767 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
768 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
769 Opcode = AMDGPU::S_MOV_B64;
770 Idx++;
771 }
772
773 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
774 .addReg(SrcSubReg)
775 .addReg(SrcReg, RegState::Implicit);
776
777 if (!FirstMI)
778 FirstMI = LastMI;
779
780 if (!Forward)
781 I--;
782 }
783
784 assert(FirstMI && LastMI);
785 if (!Forward)
786 std::swap(FirstMI, LastMI);
787
788 FirstMI->addOperand(
789 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
790
791 if (KillSrc)
792 LastMI->addRegisterKilled(SrcReg, &RI);
793}
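// Illustrative expansion (assumed registers, for exposition only): copying
// $sgpr4_sgpr5_sgpr6_sgpr7 to $sgpr0_sgpr1_sgpr2_sgpr3 hits the aligned case
// above and emits two S_MOV_B64 copies (sub0_sub1 and sub2_sub3); if either
// end were odd-aligned, e.g. starting at $sgpr1, it would fall back to four
// S_MOV_B32 copies instead.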
794
797 const DebugLoc &DL, MCRegister DestReg,
798 MCRegister SrcReg, bool KillSrc) const {
799 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
800 unsigned Size = RI.getRegSizeInBits(*RC);
801 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
802 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
803
804 // The rest of copyPhysReg assumes Src and Dst size are the same size.
805 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
806 // we remove Fix16BitCopies and this code block?
807 if (Fix16BitCopies) {
808 if (((Size == 16) != (SrcSize == 16))) {
809 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
810 assert(ST.hasTrue16BitInsts());
811 MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
812 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
813 RegToFix = SubReg;
814
815 if (DestReg == SrcReg) {
816 // Identity copy. Insert empty bundle since ExpandPostRA expects an
817 // instruction here.
818 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
819 return;
820 }
821 RC = RI.getPhysRegBaseClass(DestReg);
822 Size = RI.getRegSizeInBits(*RC);
823 SrcRC = RI.getPhysRegBaseClass(SrcReg);
824 SrcSize = RI.getRegSizeInBits(*SrcRC);
825 }
826 }
827
828 if (RC == &AMDGPU::VGPR_32RegClass) {
829 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
830 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
831 AMDGPU::AGPR_32RegClass.contains(SrcReg));
832 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
833 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
834 BuildMI(MBB, MI, DL, get(Opc), DestReg)
835 .addReg(SrcReg, getKillRegState(KillSrc));
836 return;
837 }
838
839 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
840 RC == &AMDGPU::SReg_32RegClass) {
841 if (SrcReg == AMDGPU::SCC) {
842 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
843 .addImm(1)
844 .addImm(0);
845 return;
846 }
847
848 if (DestReg == AMDGPU::VCC_LO) {
849 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
850 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
851 .addReg(SrcReg, getKillRegState(KillSrc));
852 } else {
853 // FIXME: Hack until VReg_1 removed.
854 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
855 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
856 .addImm(0)
857 .addReg(SrcReg, getKillRegState(KillSrc));
858 }
859
860 return;
861 }
862
863 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
864 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
865 return;
866 }
867
868 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
869 .addReg(SrcReg, getKillRegState(KillSrc));
870 return;
871 }
872
873 if (RC == &AMDGPU::SReg_64RegClass) {
874 if (SrcReg == AMDGPU::SCC) {
875 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
876 .addImm(1)
877 .addImm(0);
878 return;
879 }
880
881 if (DestReg == AMDGPU::VCC) {
882 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
883 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 } else {
886 // FIXME: Hack until VReg_1 removed.
887 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
888 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
889 .addImm(0)
890 .addReg(SrcReg, getKillRegState(KillSrc));
891 }
892
893 return;
894 }
895
896 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
897 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
898 return;
899 }
900
901 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
902 .addReg(SrcReg, getKillRegState(KillSrc));
903 return;
904 }
905
906 if (DestReg == AMDGPU::SCC) {
907 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
908 // but SelectionDAG emits such copies for i1 sources.
909 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
910 // This copy can only be produced by patterns
911 // with explicit SCC, which are known to be enabled
912 // only for subtargets with S_CMP_LG_U64 present.
913 assert(ST.hasScalarCompareEq64());
914 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
915 .addReg(SrcReg, getKillRegState(KillSrc))
916 .addImm(0);
917 } else {
918 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
919 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
920 .addReg(SrcReg, getKillRegState(KillSrc))
921 .addImm(0);
922 }
923
924 return;
925 }
926
927 if (RC == &AMDGPU::AGPR_32RegClass) {
928 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
929 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
930 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
931 .addReg(SrcReg, getKillRegState(KillSrc));
932 return;
933 }
934
935 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
936 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
937 .addReg(SrcReg, getKillRegState(KillSrc));
938 return;
939 }
940
941 // FIXME: Pass should maintain scavenger to avoid scan through the block on
942 // every AGPR spill.
943 RegScavenger RS;
944 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
945 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
946 return;
947 }
948
949 if (Size == 16) {
950 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
951 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
952 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
953
954 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
955 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
956 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
957 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
958 bool DstLow = !AMDGPU::isHi(DestReg, RI);
959 bool SrcLow = !AMDGPU::isHi(SrcReg, RI);
960 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
961 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
962
963 if (IsSGPRDst) {
964 if (!IsSGPRSrc) {
965 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
966 return;
967 }
968
969 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
970 .addReg(NewSrcReg, getKillRegState(KillSrc));
971 return;
972 }
973
974 if (IsAGPRDst || IsAGPRSrc) {
975 if (!DstLow || !SrcLow) {
976 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
977 "Cannot use hi16 subreg with an AGPR!");
978 }
979
980 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
981 return;
982 }
983
984 if (ST.hasTrue16BitInsts()) {
985 if (IsSGPRSrc) {
986 assert(SrcLow);
987 SrcReg = NewSrcReg;
988 }
989 // Use the smaller instruction encoding if possible.
990 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
991 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
992 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
993 .addReg(SrcReg);
994 } else {
995 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
996 .addImm(0) // src0_modifiers
997 .addReg(SrcReg)
998 .addImm(0); // op_sel
999 }
1000 return;
1001 }
1002
1003 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1004 if (!DstLow || !SrcLow) {
1005 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1006 "Cannot use hi16 subreg on VI!");
1007 }
1008
1009 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1010 .addReg(NewSrcReg, getKillRegState(KillSrc));
1011 return;
1012 }
1013
1014 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1015 .addImm(0) // src0_modifiers
1016 .addReg(NewSrcReg)
1017 .addImm(0) // clamp
1018 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1019 : AMDGPU::SDWA::SdwaSel::WORD_1)
1020 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1021 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1022 : AMDGPU::SDWA::SdwaSel::WORD_1)
1023 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1024 // First implicit operand is $exec.
1025 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1026 return;
1027 }
1028
1029 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1030 if (ST.hasMovB64()) {
1031 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1032 .addReg(SrcReg, getKillRegState(KillSrc));
1033 return;
1034 }
1035 if (ST.hasPkMovB32()) {
1036 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1037 .addImm(SISrcMods::OP_SEL_1)
1038 .addReg(SrcReg)
1039 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1040 .addReg(SrcReg)
1041 .addImm(0) // op_sel_lo
1042 .addImm(0) // op_sel_hi
1043 .addImm(0) // neg_lo
1044 .addImm(0) // neg_hi
1045 .addImm(0) // clamp
1046 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1047 return;
1048 }
1049 }
1050
1051 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1052 if (RI.isSGPRClass(RC)) {
1053 if (!RI.isSGPRClass(SrcRC)) {
1054 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1055 return;
1056 }
1057 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1058 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1059 Forward);
1060 return;
1061 }
1062
1063 unsigned EltSize = 4;
1064 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1065 if (RI.isAGPRClass(RC)) {
1066 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1067 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1068 else if (RI.hasVGPRs(SrcRC) ||
1069 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1070 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1071 else
1072 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1073 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1074 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1075 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1076 (RI.isProperlyAlignedRC(*RC) &&
1077 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1078 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1079 if (ST.hasMovB64()) {
1080 Opcode = AMDGPU::V_MOV_B64_e32;
1081 EltSize = 8;
1082 } else if (ST.hasPkMovB32()) {
1083 Opcode = AMDGPU::V_PK_MOV_B32;
1084 EltSize = 8;
1085 }
1086 }
1087
1088 // For the cases where we need an intermediate instruction/temporary register
1089 // (destination is an AGPR), we need a scavenger.
1090 //
1091 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1092 // whole block for every handled copy.
1093 std::unique_ptr<RegScavenger> RS;
1094 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1095 RS.reset(new RegScavenger());
1096
1097 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1098
1099 // If there is an overlap, we can't kill the super-register on the last
1100 // instruction, since it will also kill the components made live by this def.
1101 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1102 const bool CanKillSuperReg = KillSrc && !Overlap;
1103
1104 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1105 unsigned SubIdx;
1106 if (Forward)
1107 SubIdx = SubIndices[Idx];
1108 else
1109 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1110 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1111 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1112 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1113
1114 bool IsFirstSubreg = Idx == 0;
1115 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1116
1117 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1118 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1119 Register ImpUseSuper = SrcReg;
1120 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1121 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1122 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1123 MachineInstrBuilder MIB =
1124 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1125 .addImm(SISrcMods::OP_SEL_1)
1126 .addReg(SrcSubReg)
1127 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1128 .addReg(SrcSubReg)
1129 .addImm(0) // op_sel_lo
1130 .addImm(0) // op_sel_hi
1131 .addImm(0) // neg_lo
1132 .addImm(0) // neg_hi
1133 .addImm(0) // clamp
1134 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1135 if (IsFirstSubreg)
1136 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1137 } else {
1138 MachineInstrBuilder Builder =
1139 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1140 if (IsFirstSubreg)
1141 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1142
1143 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1144 }
1145 }
1146}
1147
1148int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1149 int NewOpc;
1150
1151 // Try to map original to commuted opcode
1152 NewOpc = AMDGPU::getCommuteRev(Opcode);
1153 if (NewOpc != -1)
1154 // Check if the commuted (REV) opcode exists on the target.
1155 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1156
1157 // Try to map commuted to original opcode
1158 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1159 if (NewOpc != -1)
1160 // Check if the original (non-REV) opcode exists on the target.
1161 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1162
1163 return Opcode;
1164}
1165
1166void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
1167 MachineBasicBlock::iterator MI,
1168 const DebugLoc &DL, Register DestReg,
1169 int64_t Value) const {
1170 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1171 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1172 if (RegClass == &AMDGPU::SReg_32RegClass ||
1173 RegClass == &AMDGPU::SGPR_32RegClass ||
1174 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1175 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1176 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1177 .addImm(Value);
1178 return;
1179 }
1180
1181 if (RegClass == &AMDGPU::SReg_64RegClass ||
1182 RegClass == &AMDGPU::SGPR_64RegClass ||
1183 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1184 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1185 .addImm(Value);
1186 return;
1187 }
1188
1189 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1190 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1191 .addImm(Value);
1192 return;
1193 }
1194 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1195 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1196 .addImm(Value);
1197 return;
1198 }
1199
1200 unsigned EltSize = 4;
1201 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1202 if (RI.isSGPRClass(RegClass)) {
1203 if (RI.getRegSizeInBits(*RegClass) > 32) {
1204 Opcode = AMDGPU::S_MOV_B64;
1205 EltSize = 8;
1206 } else {
1207 Opcode = AMDGPU::S_MOV_B32;
1208 EltSize = 4;
1209 }
1210 }
1211
1212 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1213 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1214 int64_t IdxValue = Idx == 0 ? Value : 0;
1215
1216 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1217 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1218 Builder.addImm(IdxValue);
1219 }
1220}
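// Worked example (assumed destination class, for exposition only):
// materializing Value=42 into a 128-bit SGPR tuple falls through to the
// splitting loop above with S_MOV_B64 and EltSize=8, producing
//   sub0_sub1 = S_MOV_B64 42
//   sub2_sub3 = S_MOV_B64 0
// i.e. only the first chunk receives Value and the remaining chunks are
// zeroed.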
1221
1222const TargetRegisterClass *
1223SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1224 return &AMDGPU::VGPR_32RegClass;
1225}
1226
1227void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1228 MachineBasicBlock::iterator I,
1229 const DebugLoc &DL, Register DstReg,
1230 ArrayRef<MachineOperand> Cond,
1231 Register TrueReg,
1232 Register FalseReg) const {
1233 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1234 const TargetRegisterClass *BoolXExecRC =
1235 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1236 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1237 "Not a VGPR32 reg");
1238
1239 if (Cond.size() == 1) {
1240 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1241 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1242 .add(Cond[0]);
1243 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1244 .addImm(0)
1245 .addReg(FalseReg)
1246 .addImm(0)
1247 .addReg(TrueReg)
1248 .addReg(SReg);
1249 } else if (Cond.size() == 2) {
1250 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1251 switch (Cond[0].getImm()) {
1252 case SIInstrInfo::SCC_TRUE: {
1253 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1254 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1255 : AMDGPU::S_CSELECT_B64), SReg)
1256 .addImm(1)
1257 .addImm(0);
1258 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1259 .addImm(0)
1260 .addReg(FalseReg)
1261 .addImm(0)
1262 .addReg(TrueReg)
1263 .addReg(SReg);
1264 break;
1265 }
1266 case SIInstrInfo::SCC_FALSE: {
1267 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1268 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1269 : AMDGPU::S_CSELECT_B64), SReg)
1270 .addImm(0)
1271 .addImm(1);
1272 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1273 .addImm(0)
1274 .addReg(FalseReg)
1275 .addImm(0)
1276 .addReg(TrueReg)
1277 .addReg(SReg);
1278 break;
1279 }
1280 case SIInstrInfo::VCCNZ: {
1281 MachineOperand RegOp = Cond[1];
1282 RegOp.setImplicit(false);
1283 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1284 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1285 .add(RegOp);
1286 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1287 .addImm(0)
1288 .addReg(FalseReg)
1289 .addImm(0)
1290 .addReg(TrueReg)
1291 .addReg(SReg);
1292 break;
1293 }
1294 case SIInstrInfo::VCCZ: {
1295 MachineOperand RegOp = Cond[1];
1296 RegOp.setImplicit(false);
1297 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1298 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1299 .add(RegOp);
1300 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1301 .addImm(0)
1302 .addReg(TrueReg)
1303 .addImm(0)
1304 .addReg(FalseReg)
1305 .addReg(SReg);
1306 break;
1307 }
1308 case SIInstrInfo::EXECNZ: {
1309 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1310 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1311 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1312 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1313 .addImm(0);
1314 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1315 : AMDGPU::S_CSELECT_B64), SReg)
1316 .addImm(1)
1317 .addImm(0);
1318 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1319 .addImm(0)
1320 .addReg(FalseReg)
1321 .addImm(0)
1322 .addReg(TrueReg)
1323 .addReg(SReg);
1324 break;
1325 }
1326 case SIInstrInfo::EXECZ: {
1327 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1328 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1329 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1330 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1331 .addImm(0);
1332 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1333 : AMDGPU::S_CSELECT_B64), SReg)
1334 .addImm(0)
1335 .addImm(1);
1336 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1337 .addImm(0)
1338 .addReg(FalseReg)
1339 .addImm(0)
1340 .addReg(TrueReg)
1341 .addReg(SReg);
1342 llvm_unreachable("Unhandled branch predicate EXECZ");
1343 break;
1344 }
1345 default:
1346 llvm_unreachable("invalid branch predicate");
1347 }
1348 } else {
1349 llvm_unreachable("Can only handle Cond size 1 or 2");
1350 }
1351}
1352
1353Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1354 MachineBasicBlock::iterator I,
1355 const DebugLoc &DL,
1356 Register SrcReg, int Value) const {
1357 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1358 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1359 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1360 .addImm(Value)
1361 .addReg(SrcReg);
1362
1363 return Reg;
1364}
1365
1366Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1367 MachineBasicBlock::iterator I,
1368 const DebugLoc &DL,
1369 Register SrcReg, int Value) const {
1370 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1371 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1372 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1373 .addImm(Value)
1374 .addReg(SrcReg);
1375
1376 return Reg;
1377}
1378
1379unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1380
1381 if (RI.isAGPRClass(DstRC))
1382 return AMDGPU::COPY;
1383 if (RI.getRegSizeInBits(*DstRC) == 16) {
1384 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1385 // before RA.
1386 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1387 } else if (RI.getRegSizeInBits(*DstRC) == 32) {
1388 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1389 } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
1390 return AMDGPU::S_MOV_B64;
1391 } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
1392 return AMDGPU::V_MOV_B64_PSEUDO;
1393 }
1394 return AMDGPU::COPY;
1395}
1396
1397const MCInstrDesc &
1399 bool IsIndirectSrc) const {
1400 if (IsIndirectSrc) {
1401 if (VecSize <= 32) // 4 bytes
1402 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1403 if (VecSize <= 64) // 8 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1405 if (VecSize <= 96) // 12 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1407 if (VecSize <= 128) // 16 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1409 if (VecSize <= 160) // 20 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1411 if (VecSize <= 256) // 32 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1413 if (VecSize <= 288) // 36 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1415 if (VecSize <= 320) // 40 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1417 if (VecSize <= 352) // 44 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1419 if (VecSize <= 384) // 48 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1421 if (VecSize <= 512) // 64 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1423 if (VecSize <= 1024) // 128 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1425
1426 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1427 }
1428
1429 if (VecSize <= 32) // 4 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1431 if (VecSize <= 64) // 8 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1433 if (VecSize <= 96) // 12 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1435 if (VecSize <= 128) // 16 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1437 if (VecSize <= 160) // 20 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1439 if (VecSize <= 256) // 32 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1441 if (VecSize <= 288) // 36 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1443 if (VecSize <= 320) // 40 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1445 if (VecSize <= 352) // 44 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1447 if (VecSize <= 384) // 48 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1449 if (VecSize <= 512) // 64 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1451 if (VecSize <= 1024) // 128 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1453
1454 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1455}
1456
1457static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1458 if (VecSize <= 32) // 4 bytes
1459 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1460 if (VecSize <= 64) // 8 bytes
1461 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1462 if (VecSize <= 96) // 12 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1464 if (VecSize <= 128) // 16 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1466 if (VecSize <= 160) // 20 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1468 if (VecSize <= 256) // 32 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1470 if (VecSize <= 288) // 36 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1472 if (VecSize <= 320) // 40 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1474 if (VecSize <= 352) // 44 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1476 if (VecSize <= 384) // 48 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1478 if (VecSize <= 512) // 64 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1480 if (VecSize <= 1024) // 128 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1482
1483 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1484}
1485
1486static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1487 if (VecSize <= 32) // 4 bytes
1488 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1489 if (VecSize <= 64) // 8 bytes
1490 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1491 if (VecSize <= 96) // 12 bytes
1492 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1493 if (VecSize <= 128) // 16 bytes
1494 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1495 if (VecSize <= 160) // 20 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1497 if (VecSize <= 256) // 32 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1499 if (VecSize <= 288) // 36 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1501 if (VecSize <= 320) // 40 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1503 if (VecSize <= 352) // 44 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1505 if (VecSize <= 384) // 48 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1507 if (VecSize <= 512) // 64 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1509 if (VecSize <= 1024) // 128 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1511
1512 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1513}
1514
1515static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1516 if (VecSize <= 64) // 8 bytes
1517 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1518 if (VecSize <= 128) // 16 bytes
1519 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1520 if (VecSize <= 256) // 32 bytes
1521 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1522 if (VecSize <= 512) // 64 bytes
1523 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1524 if (VecSize <= 1024) // 128 bytes
1525 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1526
1527 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1528}
1529
1530const MCInstrDesc &
1531SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1532 bool IsSGPR) const {
1533 if (IsSGPR) {
1534 switch (EltSize) {
1535 case 32:
1536 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1537 case 64:
1538 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1539 default:
1540 llvm_unreachable("invalid reg indexing elt size");
1541 }
1542 }
1543
1544 assert(EltSize == 32 && "invalid reg indexing elt size");
1545 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1546}
1547
1548static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1549 switch (Size) {
1550 case 4:
1551 return AMDGPU::SI_SPILL_S32_SAVE;
1552 case 8:
1553 return AMDGPU::SI_SPILL_S64_SAVE;
1554 case 12:
1555 return AMDGPU::SI_SPILL_S96_SAVE;
1556 case 16:
1557 return AMDGPU::SI_SPILL_S128_SAVE;
1558 case 20:
1559 return AMDGPU::SI_SPILL_S160_SAVE;
1560 case 24:
1561 return AMDGPU::SI_SPILL_S192_SAVE;
1562 case 28:
1563 return AMDGPU::SI_SPILL_S224_SAVE;
1564 case 32:
1565 return AMDGPU::SI_SPILL_S256_SAVE;
1566 case 36:
1567 return AMDGPU::SI_SPILL_S288_SAVE;
1568 case 40:
1569 return AMDGPU::SI_SPILL_S320_SAVE;
1570 case 44:
1571 return AMDGPU::SI_SPILL_S352_SAVE;
1572 case 48:
1573 return AMDGPU::SI_SPILL_S384_SAVE;
1574 case 64:
1575 return AMDGPU::SI_SPILL_S512_SAVE;
1576 case 128:
1577 return AMDGPU::SI_SPILL_S1024_SAVE;
1578 default:
1579 llvm_unreachable("unknown register size");
1580 }
1581}
1582
1583static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1584 switch (Size) {
1585 case 4:
1586 return AMDGPU::SI_SPILL_V32_SAVE;
1587 case 8:
1588 return AMDGPU::SI_SPILL_V64_SAVE;
1589 case 12:
1590 return AMDGPU::SI_SPILL_V96_SAVE;
1591 case 16:
1592 return AMDGPU::SI_SPILL_V128_SAVE;
1593 case 20:
1594 return AMDGPU::SI_SPILL_V160_SAVE;
1595 case 24:
1596 return AMDGPU::SI_SPILL_V192_SAVE;
1597 case 28:
1598 return AMDGPU::SI_SPILL_V224_SAVE;
1599 case 32:
1600 return AMDGPU::SI_SPILL_V256_SAVE;
1601 case 36:
1602 return AMDGPU::SI_SPILL_V288_SAVE;
1603 case 40:
1604 return AMDGPU::SI_SPILL_V320_SAVE;
1605 case 44:
1606 return AMDGPU::SI_SPILL_V352_SAVE;
1607 case 48:
1608 return AMDGPU::SI_SPILL_V384_SAVE;
1609 case 64:
1610 return AMDGPU::SI_SPILL_V512_SAVE;
1611 case 128:
1612 return AMDGPU::SI_SPILL_V1024_SAVE;
1613 default:
1614 llvm_unreachable("unknown register size");
1615 }
1616}
1617
1618static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1619 switch (Size) {
1620 case 4:
1621 return AMDGPU::SI_SPILL_A32_SAVE;
1622 case 8:
1623 return AMDGPU::SI_SPILL_A64_SAVE;
1624 case 12:
1625 return AMDGPU::SI_SPILL_A96_SAVE;
1626 case 16:
1627 return AMDGPU::SI_SPILL_A128_SAVE;
1628 case 20:
1629 return AMDGPU::SI_SPILL_A160_SAVE;
1630 case 24:
1631 return AMDGPU::SI_SPILL_A192_SAVE;
1632 case 28:
1633 return AMDGPU::SI_SPILL_A224_SAVE;
1634 case 32:
1635 return AMDGPU::SI_SPILL_A256_SAVE;
1636 case 36:
1637 return AMDGPU::SI_SPILL_A288_SAVE;
1638 case 40:
1639 return AMDGPU::SI_SPILL_A320_SAVE;
1640 case 44:
1641 return AMDGPU::SI_SPILL_A352_SAVE;
1642 case 48:
1643 return AMDGPU::SI_SPILL_A384_SAVE;
1644 case 64:
1645 return AMDGPU::SI_SPILL_A512_SAVE;
1646 case 128:
1647 return AMDGPU::SI_SPILL_A1024_SAVE;
1648 default:
1649 llvm_unreachable("unknown register size");
1650 }
1651}
1652
1653static unsigned getAVSpillSaveOpcode(unsigned Size) {
1654 switch (Size) {
1655 case 4:
1656 return AMDGPU::SI_SPILL_AV32_SAVE;
1657 case 8:
1658 return AMDGPU::SI_SPILL_AV64_SAVE;
1659 case 12:
1660 return AMDGPU::SI_SPILL_AV96_SAVE;
1661 case 16:
1662 return AMDGPU::SI_SPILL_AV128_SAVE;
1663 case 20:
1664 return AMDGPU::SI_SPILL_AV160_SAVE;
1665 case 24:
1666 return AMDGPU::SI_SPILL_AV192_SAVE;
1667 case 28:
1668 return AMDGPU::SI_SPILL_AV224_SAVE;
1669 case 32:
1670 return AMDGPU::SI_SPILL_AV256_SAVE;
1671 case 36:
1672 return AMDGPU::SI_SPILL_AV288_SAVE;
1673 case 40:
1674 return AMDGPU::SI_SPILL_AV320_SAVE;
1675 case 44:
1676 return AMDGPU::SI_SPILL_AV352_SAVE;
1677 case 48:
1678 return AMDGPU::SI_SPILL_AV384_SAVE;
1679 case 64:
1680 return AMDGPU::SI_SPILL_AV512_SAVE;
1681 case 128:
1682 return AMDGPU::SI_SPILL_AV1024_SAVE;
1683 default:
1684 llvm_unreachable("unknown register size");
1685 }
1686}
1687
1688static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1689 bool IsVectorSuperClass) {
1690 // Currently, only 32-bit WWM register spills are needed.
1691 if (Size != 4)
1692 llvm_unreachable("unknown wwm register spill size");
1693
1694 if (IsVectorSuperClass)
1695 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1696
1697 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1698}
1699
1700static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1701 const TargetRegisterClass *RC,
1702 unsigned Size,
1703 const SIRegisterInfo &TRI,
1704 const SIMachineFunctionInfo &MFI) {
1705 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1706
1707 // Choose the right opcode if spilling a WWM register.
1708 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1709 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1710
1711 if (IsVectorSuperClass)
1712 return getAVSpillSaveOpcode(Size);
1713
1714 return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1715 : getVGPRSpillSaveOpcode(Size);
1716}
1717
1718void SIInstrInfo::storeRegToStackSlot(
1719 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1720 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1721 const TargetRegisterInfo *TRI, Register VReg) const {
1722 MachineFunction *MF = MBB.getParent();
1723 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1724 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1725 const DebugLoc &DL = MBB.findDebugLoc(MI);
1726
1727 MachinePointerInfo PtrInfo
1728 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1729 MachineMemOperand *MMO = MF->getMachineMemOperand(
1730 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1731 FrameInfo.getObjectAlign(FrameIndex));
1732 unsigned SpillSize = TRI->getSpillSize(*RC);
1733
1734 MachineRegisterInfo &MRI = MF->getRegInfo();
1735 if (RI.isSGPRClass(RC)) {
1736 MFI->setHasSpilledSGPRs();
1737 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1738 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1739 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1740
1741 // We are only allowed to create one new instruction when spilling
1742 // registers, so we need to use pseudo instruction for spilling SGPRs.
1743 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1744
1745 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1746 // to make sure we are using the correct register class.
1747 if (SrcReg.isVirtual() && SpillSize == 4) {
1748 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1749 }
1750
1751 BuildMI(MBB, MI, DL, OpDesc)
1752 .addReg(SrcReg, getKillRegState(isKill)) // data
1753 .addFrameIndex(FrameIndex) // addr
1754 .addMemOperand(MMO)
1755 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1756
1757 if (RI.spillSGPRToVGPR())
1758 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1759 return;
1760 }
1761
1762 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1763 SpillSize, RI, *MFI);
1764 MFI->setHasSpilledVGPRs();
1765
1766 BuildMI(MBB, MI, DL, get(Opcode))
1767 .addReg(SrcReg, getKillRegState(isKill)) // data
1768 .addFrameIndex(FrameIndex) // addr
1769 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1770 .addImm(0) // offset
1771 .addMemOperand(MMO);
1772}
1773
1774static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1775 switch (Size) {
1776 case 4:
1777 return AMDGPU::SI_SPILL_S32_RESTORE;
1778 case 8:
1779 return AMDGPU::SI_SPILL_S64_RESTORE;
1780 case 12:
1781 return AMDGPU::SI_SPILL_S96_RESTORE;
1782 case 16:
1783 return AMDGPU::SI_SPILL_S128_RESTORE;
1784 case 20:
1785 return AMDGPU::SI_SPILL_S160_RESTORE;
1786 case 24:
1787 return AMDGPU::SI_SPILL_S192_RESTORE;
1788 case 28:
1789 return AMDGPU::SI_SPILL_S224_RESTORE;
1790 case 32:
1791 return AMDGPU::SI_SPILL_S256_RESTORE;
1792 case 36:
1793 return AMDGPU::SI_SPILL_S288_RESTORE;
1794 case 40:
1795 return AMDGPU::SI_SPILL_S320_RESTORE;
1796 case 44:
1797 return AMDGPU::SI_SPILL_S352_RESTORE;
1798 case 48:
1799 return AMDGPU::SI_SPILL_S384_RESTORE;
1800 case 64:
1801 return AMDGPU::SI_SPILL_S512_RESTORE;
1802 case 128:
1803 return AMDGPU::SI_SPILL_S1024_RESTORE;
1804 default:
1805 llvm_unreachable("unknown register size");
1806 }
1807}
1808
1809static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1810 switch (Size) {
1811 case 4:
1812 return AMDGPU::SI_SPILL_V32_RESTORE;
1813 case 8:
1814 return AMDGPU::SI_SPILL_V64_RESTORE;
1815 case 12:
1816 return AMDGPU::SI_SPILL_V96_RESTORE;
1817 case 16:
1818 return AMDGPU::SI_SPILL_V128_RESTORE;
1819 case 20:
1820 return AMDGPU::SI_SPILL_V160_RESTORE;
1821 case 24:
1822 return AMDGPU::SI_SPILL_V192_RESTORE;
1823 case 28:
1824 return AMDGPU::SI_SPILL_V224_RESTORE;
1825 case 32:
1826 return AMDGPU::SI_SPILL_V256_RESTORE;
1827 case 36:
1828 return AMDGPU::SI_SPILL_V288_RESTORE;
1829 case 40:
1830 return AMDGPU::SI_SPILL_V320_RESTORE;
1831 case 44:
1832 return AMDGPU::SI_SPILL_V352_RESTORE;
1833 case 48:
1834 return AMDGPU::SI_SPILL_V384_RESTORE;
1835 case 64:
1836 return AMDGPU::SI_SPILL_V512_RESTORE;
1837 case 128:
1838 return AMDGPU::SI_SPILL_V1024_RESTORE;
1839 default:
1840 llvm_unreachable("unknown register size");
1841 }
1842}
1843
1844static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1845 switch (Size) {
1846 case 4:
1847 return AMDGPU::SI_SPILL_A32_RESTORE;
1848 case 8:
1849 return AMDGPU::SI_SPILL_A64_RESTORE;
1850 case 12:
1851 return AMDGPU::SI_SPILL_A96_RESTORE;
1852 case 16:
1853 return AMDGPU::SI_SPILL_A128_RESTORE;
1854 case 20:
1855 return AMDGPU::SI_SPILL_A160_RESTORE;
1856 case 24:
1857 return AMDGPU::SI_SPILL_A192_RESTORE;
1858 case 28:
1859 return AMDGPU::SI_SPILL_A224_RESTORE;
1860 case 32:
1861 return AMDGPU::SI_SPILL_A256_RESTORE;
1862 case 36:
1863 return AMDGPU::SI_SPILL_A288_RESTORE;
1864 case 40:
1865 return AMDGPU::SI_SPILL_A320_RESTORE;
1866 case 44:
1867 return AMDGPU::SI_SPILL_A352_RESTORE;
1868 case 48:
1869 return AMDGPU::SI_SPILL_A384_RESTORE;
1870 case 64:
1871 return AMDGPU::SI_SPILL_A512_RESTORE;
1872 case 128:
1873 return AMDGPU::SI_SPILL_A1024_RESTORE;
1874 default:
1875 llvm_unreachable("unknown register size");
1876 }
1877}
1878
1879static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1880 switch (Size) {
1881 case 4:
1882 return AMDGPU::SI_SPILL_AV32_RESTORE;
1883 case 8:
1884 return AMDGPU::SI_SPILL_AV64_RESTORE;
1885 case 12:
1886 return AMDGPU::SI_SPILL_AV96_RESTORE;
1887 case 16:
1888 return AMDGPU::SI_SPILL_AV128_RESTORE;
1889 case 20:
1890 return AMDGPU::SI_SPILL_AV160_RESTORE;
1891 case 24:
1892 return AMDGPU::SI_SPILL_AV192_RESTORE;
1893 case 28:
1894 return AMDGPU::SI_SPILL_AV224_RESTORE;
1895 case 32:
1896 return AMDGPU::SI_SPILL_AV256_RESTORE;
1897 case 36:
1898 return AMDGPU::SI_SPILL_AV288_RESTORE;
1899 case 40:
1900 return AMDGPU::SI_SPILL_AV320_RESTORE;
1901 case 44:
1902 return AMDGPU::SI_SPILL_AV352_RESTORE;
1903 case 48:
1904 return AMDGPU::SI_SPILL_AV384_RESTORE;
1905 case 64:
1906 return AMDGPU::SI_SPILL_AV512_RESTORE;
1907 case 128:
1908 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1909 default:
1910 llvm_unreachable("unknown register size");
1911 }
1912}
1913
1914static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1915 bool IsVectorSuperClass) {
1916 // Currently, only 32-bit WWM register spills are needed.
1917 if (Size != 4)
1918 llvm_unreachable("unknown wwm register spill size");
1919
1920 if (IsVectorSuperClass)
1921 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1922
1923 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1924}
1925
1926static unsigned
1927getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1928 unsigned Size, const SIRegisterInfo &TRI,
1929 const SIMachineFunctionInfo &MFI) {
1930 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1931
1932 // Choose the right opcode if restoring a WWM register.
1933 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1934 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1935
1936 if (IsVectorSuperClass)
1937 return getAVSpillRestoreOpcode(Size);
1938
1939 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1940 : getVGPRSpillRestoreOpcode(Size);
1941}
1942
1943void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1944 MachineBasicBlock::iterator MI,
1945 Register DestReg, int FrameIndex,
1946 const TargetRegisterClass *RC,
1947 const TargetRegisterInfo *TRI,
1948 Register VReg) const {
1949 MachineFunction *MF = MBB.getParent();
1950 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1951 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1952 const DebugLoc &DL = MBB.findDebugLoc(MI);
1953 unsigned SpillSize = TRI->getSpillSize(*RC);
1954
1955 MachinePointerInfo PtrInfo
1956 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1957
1958 MachineMemOperand *MMO = MF->getMachineMemOperand(
1959 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1960 FrameInfo.getObjectAlign(FrameIndex));
1961
1962 if (RI.isSGPRClass(RC)) {
1963 MFI->setHasSpilledSGPRs();
1964 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1965 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1966 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1967
1968 // FIXME: Maybe this should not include a memoperand because it will be
1969 // lowered to non-memory instructions.
1970 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1971 if (DestReg.isVirtual() && SpillSize == 4) {
1972 MachineRegisterInfo &MRI = MF->getRegInfo();
1973 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1974 }
1975
1976 if (RI.spillSGPRToVGPR())
1977 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1978 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1979 .addFrameIndex(FrameIndex) // addr
1980 .addMemOperand(MMO)
1981 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1982
1983 return;
1984 }
1985
1986 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1987 SpillSize, RI, *MFI);
1988 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1989 .addFrameIndex(FrameIndex) // vaddr
1990 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1991 .addImm(0) // offset
1992 .addMemOperand(MMO);
1993}
1994
1995void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1996 MachineBasicBlock::iterator MI) const {
1997 insertNoops(MBB, MI, 1);
1998}
1999
2000void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
2001 MachineBasicBlock::iterator MI,
2002 unsigned Quantity) const {
2003 DebugLoc DL;
2004 while (Quantity > 0) {
2005 unsigned Arg = std::min(Quantity, 8u);
2006 Quantity -= Arg;
2007 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
2008 }
2009}
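// Worked example for the loop above: insertNoops(MBB, MI, 10) emits
// "s_nop 7" (one S_NOP covering 8 of the requested noops, since the
// immediate encodes Arg - 1) followed by "s_nop 1" for the remaining 2.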
2010
2011void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
2012 auto MF = MBB.getParent();
2013 auto Info = MF->getInfo<SIMachineFunctionInfo>();
2014
2015 assert(Info->isEntryFunction());
2016
2017 if (MBB.succ_empty()) {
2018 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2019 if (HasNoTerminator) {
2020 if (Info->returnsVoid()) {
2021 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2022 } else {
2023 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2024 }
2025 }
2026 }
2027}
2028
2029unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2030 switch (MI.getOpcode()) {
2031 default:
2032 if (MI.isMetaInstruction())
2033 return 0;
2034 return 1; // FIXME: Do wait states equal cycles?
2035
2036 case AMDGPU::S_NOP:
2037 return MI.getOperand(0).getImm() + 1;
2038 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2039 // hazard, even if one exists, won't really be visible. Should we handle it?
2040 }
2041}
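// For example, "s_nop 3" reports 4 wait states, a meta instruction such as
// a DBG_VALUE reports 0, and every other instruction currently counts as 1.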
2042
2043bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2044 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2045 MachineBasicBlock &MBB = *MI.getParent();
2046 DebugLoc DL = MBB.findDebugLoc(MI);
2047 switch (MI.getOpcode()) {
2048 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2049 case AMDGPU::S_MOV_B64_term:
2050 // This is only a terminator to get the correct spill code placement during
2051 // register allocation.
2052 MI.setDesc(get(AMDGPU::S_MOV_B64));
2053 break;
2054
2055 case AMDGPU::S_MOV_B32_term:
2056 // This is only a terminator to get the correct spill code placement during
2057 // register allocation.
2058 MI.setDesc(get(AMDGPU::S_MOV_B32));
2059 break;
2060
2061 case AMDGPU::S_XOR_B64_term:
2062 // This is only a terminator to get the correct spill code placement during
2063 // register allocation.
2064 MI.setDesc(get(AMDGPU::S_XOR_B64));
2065 break;
2066
2067 case AMDGPU::S_XOR_B32_term:
2068 // This is only a terminator to get the correct spill code placement during
2069 // register allocation.
2070 MI.setDesc(get(AMDGPU::S_XOR_B32));
2071 break;
2072 case AMDGPU::S_OR_B64_term:
2073 // This is only a terminator to get the correct spill code placement during
2074 // register allocation.
2075 MI.setDesc(get(AMDGPU::S_OR_B64));
2076 break;
2077 case AMDGPU::S_OR_B32_term:
2078 // This is only a terminator to get the correct spill code placement during
2079 // register allocation.
2080 MI.setDesc(get(AMDGPU::S_OR_B32));
2081 break;
2082
2083 case AMDGPU::S_ANDN2_B64_term:
2084 // This is only a terminator to get the correct spill code placement during
2085 // register allocation.
2086 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2087 break;
2088
2089 case AMDGPU::S_ANDN2_B32_term:
2090 // This is only a terminator to get the correct spill code placement during
2091 // register allocation.
2092 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2093 break;
2094
2095 case AMDGPU::S_AND_B64_term:
2096 // This is only a terminator to get the correct spill code placement during
2097 // register allocation.
2098 MI.setDesc(get(AMDGPU::S_AND_B64));
2099 break;
2100
2101 case AMDGPU::S_AND_B32_term:
2102 // This is only a terminator to get the correct spill code placement during
2103 // register allocation.
2104 MI.setDesc(get(AMDGPU::S_AND_B32));
2105 break;
2106
2107 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2108 // This is only a terminator to get the correct spill code placement during
2109 // register allocation.
2110 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2111 break;
2112
2113 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2114 // This is only a terminator to get the correct spill code placement during
2115 // register allocation.
2116 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2117 break;
2118
2119 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2120 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2121 break;
2122
2123 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2124 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2125 break;
2126
2127 case AMDGPU::V_MOV_B64_PSEUDO: {
2128 Register Dst = MI.getOperand(0).getReg();
2129 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2130 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2131
2132 const MachineOperand &SrcOp = MI.getOperand(1);
2133 // FIXME: Will this work for 64-bit floating point immediates?
2134 assert(!SrcOp.isFPImm());
2135 if (ST.hasMovB64()) {
2136 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2137 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2138 isUInt<32>(SrcOp.getImm()))
2139 break;
2140 }
2141 if (SrcOp.isImm()) {
2142 APInt Imm(64, SrcOp.getImm());
2143 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2144 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2145 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2146 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2147 .addImm(SISrcMods::OP_SEL_1)
2148 .addImm(Lo.getSExtValue())
2149 .addImm(SISrcMods::OP_SEL_1)
2150 .addImm(Lo.getSExtValue())
2151 .addImm(0) // op_sel_lo
2152 .addImm(0) // op_sel_hi
2153 .addImm(0) // neg_lo
2154 .addImm(0) // neg_hi
2155 .addImm(0); // clamp
2156 } else {
2157 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2158 .addImm(Lo.getSExtValue())
2159 .addReg(Dst, RegState::Implicit | RegState::Define);
2160 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2161 .addImm(Hi.getSExtValue())
2162 .addReg(Dst, RegState::Implicit | RegState::Define);
2163 }
2164 } else {
2165 assert(SrcOp.isReg());
2166 if (ST.hasPkMovB32() &&
2167 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2168 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2169 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2170 .addReg(SrcOp.getReg())
2171 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2172 .addReg(SrcOp.getReg())
2173 .addImm(0) // op_sel_lo
2174 .addImm(0) // op_sel_hi
2175 .addImm(0) // neg_lo
2176 .addImm(0) // neg_hi
2177 .addImm(0); // clamp
2178 } else {
2179 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2180 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2181 .addReg(Dst, RegState::Implicit | RegState::Define);
2182 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2183 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2184 .addReg(Dst, RegState::Implicit | RegState::Define);
2185 }
2186 }
2187 MI.eraseFromParent();
2188 break;
2189 }
2190 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2191 expandMovDPP64(MI);
2192 break;
2193 }
2194 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2195 const MachineOperand &SrcOp = MI.getOperand(1);
2196 assert(!SrcOp.isFPImm());
2197 APInt Imm(64, SrcOp.getImm());
2198 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2199 MI.setDesc(get(AMDGPU::S_MOV_B64));
2200 break;
2201 }
2202
2203 Register Dst = MI.getOperand(0).getReg();
2204 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2205 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2206
2207 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2208 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2209 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2210 .addImm(Lo.getSExtValue())
2211 .addReg(Dst, RegState::Implicit | RegState::Define);
2212 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2213 .addImm(Hi.getSExtValue())
2214 .addReg(Dst, RegState::Implicit | RegState::Define);
2215 MI.eraseFromParent();
2216 break;
2217 }
2218 case AMDGPU::V_SET_INACTIVE_B32: {
2219 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2220 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2221 // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
2222 // optimizations (mainly Register Coalescer) aware of WWM register liveness.
2223 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2224 .add(MI.getOperand(1));
2225 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2226 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2227 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2228 .add(MI.getOperand(2));
2229 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2230 .addReg(Exec);
2231 MI.eraseFromParent();
2232 break;
2233 }
2234 case AMDGPU::V_SET_INACTIVE_B64: {
2235 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2236 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2237 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2238 MI.getOperand(0).getReg())
2239 .add(MI.getOperand(1));
2240 expandPostRAPseudo(*Copy);
2241 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2242 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2243 Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2244 MI.getOperand(0).getReg())
2245 .add(MI.getOperand(2));
2246 expandPostRAPseudo(*Copy);
2247 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2248 .addReg(Exec);
2249 MI.eraseFromParent();
2250 break;
2251 }
2252 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2253 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2254 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2255 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2256 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2257 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2258 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2259 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2260 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2261 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2262 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2263 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2264 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2265 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2266 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2267 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2268 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2269 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2270 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2271 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2272 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2273 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2274 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2275 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2276 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2277 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2278 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2279 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2280 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2281 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2282
2283 unsigned Opc;
2284 if (RI.hasVGPRs(EltRC)) {
2285 Opc = AMDGPU::V_MOVRELD_B32_e32;
2286 } else {
2287 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2288 : AMDGPU::S_MOVRELD_B32;
2289 }
2290
2291 const MCInstrDesc &OpDesc = get(Opc);
2292 Register VecReg = MI.getOperand(0).getReg();
2293 bool IsUndef = MI.getOperand(1).isUndef();
2294 unsigned SubReg = MI.getOperand(3).getImm();
2295 assert(VecReg == MI.getOperand(1).getReg());
2296
2297 MachineInstrBuilder MIB =
2298 BuildMI(MBB, MI, DL, OpDesc)
2299 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2300 .add(MI.getOperand(2))
2301 .addReg(VecReg, RegState::ImplicitDefine)
2302 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2303
2304 const int ImpDefIdx =
2305 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2306 const int ImpUseIdx = ImpDefIdx + 1;
2307 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2308 MI.eraseFromParent();
2309 break;
2310 }
2311 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2312 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2313 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2314 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2315 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2316 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2317 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2318 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2319 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2320 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2321 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2322 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2324 Register VecReg = MI.getOperand(0).getReg();
2325 bool IsUndef = MI.getOperand(1).isUndef();
2326 Register Idx = MI.getOperand(3).getReg();
2327 Register SubReg = MI.getOperand(4).getImm();
2328
2329 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2330 .addReg(Idx)
2331 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2332 SetOn->getOperand(3).setIsUndef();
2333
2334 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2335 MachineInstrBuilder MIB =
2336 BuildMI(MBB, MI, DL, OpDesc)
2337 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2338 .add(MI.getOperand(2))
2339 .addReg(VecReg, RegState::ImplicitDefine)
2340 .addReg(VecReg,
2341 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2342
2343 const int ImpDefIdx =
2344 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2345 const int ImpUseIdx = ImpDefIdx + 1;
2346 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2347
2348 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2349
2350 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2351
2352 MI.eraseFromParent();
2353 break;
2354 }
2355 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2356 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2357 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2358 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2359 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2360 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2361 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2362 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2363 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2364 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2365 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2366 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2368 Register Dst = MI.getOperand(0).getReg();
2369 Register VecReg = MI.getOperand(1).getReg();
2370 bool IsUndef = MI.getOperand(1).isUndef();
2371 Register Idx = MI.getOperand(2).getReg();
2372 Register SubReg = MI.getOperand(3).getImm();
2373
2374 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2375 .addReg(Idx)
2376 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2377 SetOn->getOperand(3).setIsUndef();
2378
2379 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2380 .addDef(Dst)
2381 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2382 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2383
2384 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2385
2386 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2387
2388 MI.eraseFromParent();
2389 break;
2390 }
2391 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2392 MachineFunction &MF = *MBB.getParent();
2393 Register Reg = MI.getOperand(0).getReg();
2394 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2395 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2396 MachineOperand OpLo = MI.getOperand(1);
2397 MachineOperand OpHi = MI.getOperand(2);
2398
2399 // Create a bundle so these instructions won't be re-ordered by the
2400 // post-RA scheduler.
2401 MIBundleBuilder Bundler(MBB, MI);
2402 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2403
2404 // What we want here is an offset from the value returned by s_getpc (which
2405 // is the address of the s_add_u32 instruction) to the global variable, but
2406 // since the encoding of $symbol starts 4 bytes after the start of the
2407 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2408 // small. This requires us to add 4 to the global variable offset in order
2409 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2410 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2411 // instruction.
2412
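// Put differently: the $symbol literal of s_add_u32 sits 4 bytes past the
// PC produced by s_getpc_b64 and the literal of s_addc_u32 sits 12 bytes
// past it, so 4 and 12 are folded into the operand offsets below. If the
// s_sext_i32_i16 fixup is emitted, every literal moves 4 more bytes away
// from the s_getpc_b64 result, which is what Adjust accounts for.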
2413 int64_t Adjust = 0;
2414 if (ST.hasGetPCZeroExtension()) {
2415 // Fix up hardware that does not sign-extend the 48-bit PC value by
2416 // inserting: s_sext_i32_i16 reghi, reghi
2417 Bundler.append(
2418 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2419 Adjust += 4;
2420 }
2421
2422 if (OpLo.isGlobal())
2423 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2424 Bundler.append(
2425 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2426
2427 if (OpHi.isGlobal())
2428 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2429 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2430 .addReg(RegHi)
2431 .add(OpHi));
2432
2433 finalizeBundle(MBB, Bundler.begin());
2434
2435 MI.eraseFromParent();
2436 break;
2437 }
2438 case AMDGPU::ENTER_STRICT_WWM: {
2439 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2440 // Whole Wave Mode is entered.
2441 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2442 : AMDGPU::S_OR_SAVEEXEC_B64));
2443 break;
2444 }
2445 case AMDGPU::ENTER_STRICT_WQM: {
2446 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2447 // STRICT_WQM is entered.
2448 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2449 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2450 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2451 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2452 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2453
2454 MI.eraseFromParent();
2455 break;
2456 }
2457 case AMDGPU::EXIT_STRICT_WWM:
2458 case AMDGPU::EXIT_STRICT_WQM: {
2459 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2460 // WWM/STRICT_WQM is exited.
2461 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2462 break;
2463 }
2464 case AMDGPU::ENTER_PSEUDO_WM:
2465 case AMDGPU::EXIT_PSEUDO_WM: {
2466 // These do nothing.
2467 MI.eraseFromParent();
2468 break;
2469 }
2470 case AMDGPU::SI_RETURN: {
2471 const MachineFunction *MF = MBB.getParent();
2472 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2473 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2474 // Hiding the return address use with SI_RETURN may lead to extra kills in
2475 // the function and missing live-ins. We are fine in practice because callee
2476 // saved register handling ensures the register value is restored before
2477 // RET, but we need the undef flag here to appease the MachineVerifier
2478 // liveness checks.
2479 MachineInstrBuilder MIB =
2480 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2481 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2482
2483 MIB.copyImplicitOps(MI);
2484 MI.eraseFromParent();
2485 break;
2486 }
2487
2488 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2489 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2490 MI.setDesc(get(AMDGPU::S_MUL_U64));
2491 break;
2492
2493 case AMDGPU::S_GETPC_B64_pseudo:
2494 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2495 if (ST.hasGetPCZeroExtension()) {
2496 Register Dst = MI.getOperand(0).getReg();
2497 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2498 // Fix up hardware that does not sign-extend the 48-bit PC value by
2499 // inserting: s_sext_i32_i16 dsthi, dsthi
2500 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2501 DstHi)
2502 .addReg(DstHi);
2503 }
2504 break;
2505 }
2506 return true;
2507}
2508
2509void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2510 MachineBasicBlock::iterator I, Register DestReg,
2511 unsigned SubIdx, const MachineInstr &Orig,
2512 const TargetRegisterInfo &RI) const {
2513
2514 // Try shrinking the instruction to remat only the part needed for current
2515 // context.
2516 // TODO: Handle more cases.
2517 unsigned Opcode = Orig.getOpcode();
2518 switch (Opcode) {
2519 case AMDGPU::S_LOAD_DWORDX16_IMM:
2520 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2521 if (SubIdx != 0)
2522 break;
2523
2524 if (I == MBB.end())
2525 break;
2526
2527 if (I->isBundled())
2528 break;
2529
2530 // Look for a single use of the register that is also a subreg.
2531 Register RegToFind = Orig.getOperand(0).getReg();
2532 MachineOperand *UseMO = nullptr;
2533 for (auto &CandMO : I->operands()) {
2534 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2535 continue;
2536 if (UseMO) {
2537 UseMO = nullptr;
2538 break;
2539 }
2540 UseMO = &CandMO;
2541 }
2542 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2543 break;
2544
2545 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2546 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2547
2548 MachineFunction *MF = MBB.getParent();
2549 MachineRegisterInfo &MRI = MF->getRegInfo();
2550 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2551
2552 unsigned NewOpcode = -1;
2553 if (SubregSize == 256)
2554 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2555 else if (SubregSize == 128)
2556 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2557 else
2558 break;
2559
2560 const MCInstrDesc &TID = get(NewOpcode);
2561 const TargetRegisterClass *NewRC =
2562 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2563 MRI.setRegClass(DestReg, NewRC);
2564
2565 UseMO->setReg(DestReg);
2566 UseMO->setSubReg(AMDGPU::NoSubRegister);
2567
2568 // Use a smaller load with the desired size, possibly with updated offset.
2569 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2570 MI->setDesc(TID);
2571 MI->getOperand(0).setReg(DestReg);
2572 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2573 if (Offset) {
2574 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2575 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2576 OffsetMO->setImm(FinalOffset);
2577 }
2578 SmallVector<MachineMemOperand *> NewMMOs;
2579 for (const MachineMemOperand *MemOp : Orig.memoperands())
2580 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2581 SubregSize / 8));
2582 MI->setMemRefs(*MF, NewMMOs);
2583
2584 MBB.insert(I, MI);
2585 return;
2586 }
2587
2588 default:
2589 break;
2590 }
2591
2592 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2593}
2594
2595std::pair<MachineInstr*, MachineInstr*>
2596SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2597 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2598
2599 if (ST.hasMovB64() &&
2600 AMDGPU::isLegal64BitDPPControl(
2601 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2602 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2603 return std::pair(&MI, nullptr);
2604 }
2605
2606 MachineBasicBlock &MBB = *MI.getParent();
2607 DebugLoc DL = MBB.findDebugLoc(MI);
2608 MachineFunction *MF = MBB.getParent();
2609 MachineRegisterInfo &MRI = MF->getRegInfo();
2610 Register Dst = MI.getOperand(0).getReg();
2611 unsigned Part = 0;
2612 MachineInstr *Split[2];
2613
2614 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2615 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2616 if (Dst.isPhysical()) {
2617 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2618 } else {
2619 assert(MRI.isSSA());
2620 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2621 MovDPP.addDef(Tmp);
2622 }
2623
2624 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2625 const MachineOperand &SrcOp = MI.getOperand(I);
2626 assert(!SrcOp.isFPImm());
2627 if (SrcOp.isImm()) {
2628 APInt Imm(64, SrcOp.getImm());
2629 Imm.ashrInPlace(Part * 32);
2630 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2631 } else {
2632 assert(SrcOp.isReg());
2633 Register Src = SrcOp.getReg();
2634 if (Src.isPhysical())
2635 MovDPP.addReg(RI.getSubReg(Src, Sub));
2636 else
2637 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2638 }
2639 }
2640
2641 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2642 MovDPP.addImm(MO.getImm());
2643
2644 Split[Part] = MovDPP;
2645 ++Part;
2646 }
2647
2648 if (Dst.isVirtual())
2649 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2650 .addReg(Split[0]->getOperand(0).getReg())
2651 .addImm(AMDGPU::sub0)
2652 .addReg(Split[1]->getOperand(0).getReg())
2653 .addImm(AMDGPU::sub1);
2654
2655 MI.eraseFromParent();
2656 return std::pair(Split[0], Split[1]);
2657}
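// Example: a V_MOV_B64_DPP_PSEUDO whose source is a 64-bit immediate is
// split into two V_MOV_B32_dpp instructions, the first taking the low 32
// bits and the second the high 32 bits (Imm arithmetically shifted right by
// 32), recombined by the REG_SEQUENCE when the destination is virtual.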
2658
2659std::optional<DestSourcePair>
2660SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2661 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2662 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2663
2664 return std::nullopt;
2665}
2666
2667bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
2668 MachineOperand &Src0,
2669 unsigned Src0OpName,
2670 MachineOperand &Src1,
2671 unsigned Src1OpName) const {
2672 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2673 if (!Src0Mods)
2674 return false;
2675
2676 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2677 assert(Src1Mods &&
2678 "All commutable instructions have both src0 and src1 modifiers");
2679
2680 int Src0ModsVal = Src0Mods->getImm();
2681 int Src1ModsVal = Src1Mods->getImm();
2682
2683 Src1Mods->setImm(Src0ModsVal);
2684 Src0Mods->setImm(Src1ModsVal);
2685 return true;
2686}
2687
2688static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2689 MachineOperand &RegOp,
2690 MachineOperand &NonRegOp) {
2691 Register Reg = RegOp.getReg();
2692 unsigned SubReg = RegOp.getSubReg();
2693 bool IsKill = RegOp.isKill();
2694 bool IsDead = RegOp.isDead();
2695 bool IsUndef = RegOp.isUndef();
2696 bool IsDebug = RegOp.isDebug();
2697
2698 if (NonRegOp.isImm())
2699 RegOp.ChangeToImmediate(NonRegOp.getImm());
2700 else if (NonRegOp.isFI())
2701 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2702 else if (NonRegOp.isGlobal()) {
2703 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2704 NonRegOp.getTargetFlags());
2705 } else
2706 return nullptr;
2707
2708 // Make sure we don't reinterpret a subreg index in the target flags.
2709 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2710
2711 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2712 NonRegOp.setSubReg(SubReg);
2713
2714 return &MI;
2715}
2716
2717MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2718 unsigned Src0Idx,
2719 unsigned Src1Idx) const {
2720 assert(!NewMI && "this should never be used");
2721
2722 unsigned Opc = MI.getOpcode();
2723 int CommutedOpcode = commuteOpcode(Opc);
2724 if (CommutedOpcode == -1)
2725 return nullptr;
2726
2727 if (Src0Idx > Src1Idx)
2728 std::swap(Src0Idx, Src1Idx);
2729
2730 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2731 static_cast<int>(Src0Idx) &&
2732 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2733 static_cast<int>(Src1Idx) &&
2734 "inconsistency with findCommutedOpIndices");
2735
2736 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2737 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2738
2739 MachineInstr *CommutedMI = nullptr;
2740 if (Src0.isReg() && Src1.isReg()) {
2741 if (isOperandLegal(MI, Src1Idx, &Src0)) {
2742 // Be sure to copy the source modifiers to the right place.
2743 CommutedMI
2744 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2745 }
2746
2747 } else if (Src0.isReg() && !Src1.isReg()) {
2748 // src0 should always be able to support any operand type, so no need to
2749 // check operand legality.
2750 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2751 } else if (!Src0.isReg() && Src1.isReg()) {
2752 if (isOperandLegal(MI, Src1Idx, &Src0))
2753 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2754 } else {
2755 // FIXME: Found two non-register operands to commute. This does happen.
2756 return nullptr;
2757 }
2758
2759 if (CommutedMI) {
2760 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2761 Src1, AMDGPU::OpName::src1_modifiers);
2762
2763 CommutedMI->setDesc(get(CommutedOpcode));
2764 }
2765
2766 return CommutedMI;
2767}
2768
2769// This needs to be implemented because the source modifiers may be inserted
2770// between the true commutable operands, and the base
2771// TargetInstrInfo::commuteInstruction uses it.
2772bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2773 unsigned &SrcOpIdx0,
2774 unsigned &SrcOpIdx1) const {
2775 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2776}
2777
2778bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2779 unsigned &SrcOpIdx0,
2780 unsigned &SrcOpIdx1) const {
2781 if (!Desc.isCommutable())
2782 return false;
2783
2784 unsigned Opc = Desc.getOpcode();
2785 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2786 if (Src0Idx == -1)
2787 return false;
2788
2789 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2790 if (Src1Idx == -1)
2791 return false;
2792
2793 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2794}
2795
2796bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2797 int64_t BrOffset) const {
2798 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2799 // block is unanalyzable.
2800 assert(BranchOp != AMDGPU::S_SETPC_B64);
2801
2802 // Convert to dwords.
2803 BrOffset /= 4;
2804
2805 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2806 // from the next instruction.
2807 BrOffset -= 1;
2808
2809 return isIntN(BranchOffsetBits, BrOffset);
2810}
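// Worked example, assuming BranchOffsetBits is 16: a forward byte offset of
// 0x20000 becomes 0x8000 dwords and then 0x7fff once the implicit +4 is
// discounted, which still fits in a signed 16-bit immediate; one more dword
// would not.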
2811
2812MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
2813 const MachineInstr &MI) const {
2814 return MI.getOperand(0).getMBB();
2815}
2816
2817bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const {
2818 for (const MachineInstr &MI : MBB->terminators()) {
2819 if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
2820 MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2821 MI.getOpcode() == AMDGPU::SI_LOOP)
2822 return true;
2823 }
2824 return false;
2825}
2826
2827void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2828 MachineBasicBlock &DestBB,
2829 MachineBasicBlock &RestoreBB,
2830 const DebugLoc &DL, int64_t BrOffset,
2831 RegScavenger *RS) const {
2832 assert(RS && "RegScavenger required for long branching");
2833 assert(MBB.empty() &&
2834 "new block should be inserted for expanding unconditional branch");
2835 assert(MBB.pred_size() == 1);
2836 assert(RestoreBB.empty() &&
2837 "restore block should be inserted for restoring clobbered registers");
2838
2839 MachineFunction *MF = MBB.getParent();
2840 MachineRegisterInfo &MRI = MF->getRegInfo();
2841 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2842
2843 // FIXME: Virtual register workaround for RegScavenger not working with empty
2844 // blocks.
2845 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2846
2847 auto I = MBB.end();
2848
2849 // We need to compute the offset relative to the instruction immediately after
2850 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2851 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2852
2853 auto &MCCtx = MF->getContext();
2854 MCSymbol *PostGetPCLabel =
2855 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2856 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2857
2858 MCSymbol *OffsetLo =
2859 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2860 MCSymbol *OffsetHi =
2861 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2862 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2863 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2864 .addReg(PCReg, 0, AMDGPU::sub0)
2865 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2866 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2867 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2868 .addReg(PCReg, 0, AMDGPU::sub1)
2869 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2870
2871 // Insert the indirect branch after the other terminator.
2872 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2873 .addReg(PCReg);
2874
2875 // If a spill is needed for the pc register pair, we need to insert a spill
2876 // restore block right before the destination block, and insert a short branch
2877 // into the old destination block's fallthrough predecessor.
2878 // e.g.:
2879 //
2880 // s_cbranch_scc0 skip_long_branch:
2881 //
2882 // long_branch_bb:
2883 // spill s[8:9]
2884 // s_getpc_b64 s[8:9]
2885 // s_add_u32 s8, s8, restore_bb
2886 // s_addc_u32 s9, s9, 0
2887 // s_setpc_b64 s[8:9]
2888 //
2889 // skip_long_branch:
2890 // foo;
2891 //
2892 // .....
2893 //
2894 // dest_bb_fallthrough_predecessor:
2895 // bar;
2896 // s_branch dest_bb
2897 //
2898 // restore_bb:
2899 // restore s[8:9]
2900 // fallthrough dest_bb
2901 //
2902 // dest_bb:
2903 // buzz;
2904
2905 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2906 Register Scav;
2907
2908 // If we've previously reserved a register for long branches,
2909 // avoid running the scavenger and just use that register.
2910 if (LongBranchReservedReg) {
2911 RS->enterBasicBlock(MBB);
2912 Scav = LongBranchReservedReg;
2913 } else {
2914 RS->enterBasicBlockEnd(MBB);
2915 Scav = RS->scavengeRegisterBackwards(
2916 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2917 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2918 }
2919 if (Scav) {
2920 RS->setRegUsed(Scav);
2921 MRI.replaceRegWith(PCReg, Scav);
2922 MRI.clearVirtRegs();
2923 } else {
2924 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
2925 // SGPR spill.
2926 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2927 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2928 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
2929 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
2930 MRI.clearVirtRegs();
2931 }
2932
2933 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
2934 // Now, the distance could be defined.
2935 auto *Offset = MCBinaryExpr::createSub(
2936 MCSymbolRefExpr::create(DestLabel, MCCtx),
2937 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
2938 // Add offset assignments.
2939 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
2940 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
2941 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
2942 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
2943}
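// Example of the offset split above: if DestLabel - PostGetPCLabel
// evaluates to 0x100000004, OffsetLo is bound to the masked low half (0x4)
// and OffsetHi to the arithmetic right shift by 32 (0x1), feeding the
// s_add_u32 / s_addc_u32 pair emitted earlier.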
2944
2945unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2946 switch (Cond) {
2947 case SIInstrInfo::SCC_TRUE:
2948 return AMDGPU::S_CBRANCH_SCC1;
2949 case SIInstrInfo::SCC_FALSE:
2950 return AMDGPU::S_CBRANCH_SCC0;
2951 case SIInstrInfo::VCCNZ:
2952 return AMDGPU::S_CBRANCH_VCCNZ;
2953 case SIInstrInfo::VCCZ:
2954 return AMDGPU::S_CBRANCH_VCCZ;
2955 case SIInstrInfo::EXECNZ:
2956 return AMDGPU::S_CBRANCH_EXECNZ;
2957 case SIInstrInfo::EXECZ:
2958 return AMDGPU::S_CBRANCH_EXECZ;
2959 default:
2960 llvm_unreachable("invalid branch predicate");
2961 }
2962}
2963
2964SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
2965 switch (Opcode) {
2966 case AMDGPU::S_CBRANCH_SCC0:
2967 return SCC_FALSE;
2968 case AMDGPU::S_CBRANCH_SCC1:
2969 return SCC_TRUE;
2970 case AMDGPU::S_CBRANCH_VCCNZ:
2971 return VCCNZ;
2972 case AMDGPU::S_CBRANCH_VCCZ:
2973 return VCCZ;
2974 case AMDGPU::S_CBRANCH_EXECNZ:
2975 return EXECNZ;
2976 case AMDGPU::S_CBRANCH_EXECZ:
2977 return EXECZ;
2978 default:
2979 return INVALID_BR;
2980 }
2981}
2982
2983bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
2984 MachineBasicBlock::iterator I,
2985 MachineBasicBlock *&TBB,
2986 MachineBasicBlock *&FBB,
2987 SmallVectorImpl<MachineOperand> &Cond,
2988 bool AllowModify) const {
2989 if (I->getOpcode() == AMDGPU::S_BRANCH) {
2990 // Unconditional Branch
2991 TBB = I->getOperand(0).getMBB();
2992 return false;
2993 }
2994
2995 MachineBasicBlock *CondBB = nullptr;
2996
2997 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
2998 CondBB = I->getOperand(1).getMBB();
2999 Cond.push_back(I->getOperand(0));
3000 } else {
3001 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3002 if (Pred == INVALID_BR)
3003 return true;
3004
3005 CondBB = I->getOperand(0).getMBB();
3006 Cond.push_back(MachineOperand::CreateImm(Pred));
3007 Cond.push_back(I->getOperand(1)); // Save the branch register.
3008 }
3009 ++I;
3010
3011 if (I == MBB.end()) {
3012 // Conditional branch followed by fall-through.
3013 TBB = CondBB;
3014 return false;
3015 }
3016
3017 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3018 TBB = CondBB;
3019 FBB = I->getOperand(0).getMBB();
3020 return false;
3021 }
3022
3023 return true;
3024}
3025
3026bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3027 MachineBasicBlock *&FBB,
3028 SmallVectorImpl<MachineOperand> &Cond,
3029 bool AllowModify) const {
3030 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3031 auto E = MBB.end();
3032 if (I == E)
3033 return false;
3034
3035 // Skip over the instructions that are artificially terminators for special
3036 // exec management.
3037 while (I != E && !I->isBranch() && !I->isReturn()) {
3038 switch (I->getOpcode()) {
3039 case AMDGPU::S_MOV_B64_term:
3040 case AMDGPU::S_XOR_B64_term:
3041 case AMDGPU::S_OR_B64_term:
3042 case AMDGPU::S_ANDN2_B64_term:
3043 case AMDGPU::S_AND_B64_term:
3044 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3045 case AMDGPU::S_MOV_B32_term:
3046 case AMDGPU::S_XOR_B32_term:
3047 case AMDGPU::S_OR_B32_term:
3048 case AMDGPU::S_ANDN2_B32_term:
3049 case AMDGPU::S_AND_B32_term:
3050 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3051 break;
3052 case AMDGPU::SI_IF:
3053 case AMDGPU::SI_ELSE:
3054 case AMDGPU::SI_KILL_I1_TERMINATOR:
3055 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3056 // FIXME: It's messy that these need to be considered here at all.
3057 return true;
3058 default:
3059 llvm_unreachable("unexpected non-branch terminator inst");
3060 }
3061
3062 ++I;
3063 }
3064
3065 if (I == E)
3066 return false;
3067
3068 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3069}
3070
3071unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3072 int *BytesRemoved) const {
3073 unsigned Count = 0;
3074 unsigned RemovedSize = 0;
3075 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3076 // Skip over artificial terminators when removing instructions.
3077 if (MI.isBranch() || MI.isReturn()) {
3078 RemovedSize += getInstSizeInBytes(MI);
3079 MI.eraseFromParent();
3080 ++Count;
3081 }
3082 }
3083
3084 if (BytesRemoved)
3085 *BytesRemoved = RemovedSize;
3086
3087 return Count;
3088}
3089
3090// Copy the flags onto the implicit condition register operand.
3092 const MachineOperand &OrigCond) {
3093 CondReg.setIsUndef(OrigCond.isUndef());
3094 CondReg.setIsKill(OrigCond.isKill());
3095}
3096
3097unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3098 MachineBasicBlock *TBB,
3099 MachineBasicBlock *FBB,
3100 ArrayRef<MachineOperand> Cond,
3101 const DebugLoc &DL,
3102 int *BytesAdded) const {
3103 if (!FBB && Cond.empty()) {
3104 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3105 .addMBB(TBB);
3106 if (BytesAdded)
3107 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3108 return 1;
3109 }
3110
3111 if (Cond.size() == 1 && Cond[0].isReg()) {
3112 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
3113 .add(Cond[0])
3114 .addMBB(TBB);
3115 return 1;
3116 }
3117
3118 assert(TBB && Cond[0].isImm());
3119
3120 unsigned Opcode
3121 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3122
3123 if (!FBB) {
3124 MachineInstr *CondBr =
3125 BuildMI(&MBB, DL, get(Opcode))
3126 .addMBB(TBB);
3127
3128 // Copy the flags onto the implicit condition register operand.
3129 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3130 fixImplicitOperands(*CondBr);
3131
3132 if (BytesAdded)
3133 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3134 return 1;
3135 }
3136
3137 assert(TBB && FBB);
3138
3139 MachineInstr *CondBr =
3140 BuildMI(&MBB, DL, get(Opcode))
3141 .addMBB(TBB);
3142 fixImplicitOperands(*CondBr);
3143 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3144 .addMBB(FBB);
3145
3146 MachineOperand &CondReg = CondBr->getOperand(1);
3147 CondReg.setIsUndef(Cond[1].isUndef());
3148 CondReg.setIsKill(Cond[1].isKill());
3149
3150 if (BytesAdded)
3151 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3152
3153 return 2;
3154}
3155
3156bool SIInstrInfo::reverseBranchCondition(
3157 SmallVectorImpl<MachineOperand> &Cond) const {
3158 if (Cond.size() != 2) {
3159 return true;
3160 }
3161
3162 if (Cond[0].isImm()) {
3163 Cond[0].setImm(-Cond[0].getImm());
3164 return false;
3165 }
3166
3167 return true;
3168}
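// The negation above works because each BranchPredicate value is paired
// with its logical complement at the opposite sign, so negating the
// immediate flips, for example, SCC_TRUE to SCC_FALSE without a table
// lookup.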
3169
3170bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3171 ArrayRef<MachineOperand> Cond,
3172 Register DstReg, Register TrueReg,
3173 Register FalseReg, int &CondCycles,
3174 int &TrueCycles, int &FalseCycles) const {
3175 switch (Cond[0].getImm()) {
3176 case VCCNZ:
3177 case VCCZ: {
3178 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3179 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3180 if (MRI.getRegClass(FalseReg) != RC)
3181 return false;
3182
3183 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3184 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3185
3186 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3187 return RI.hasVGPRs(RC) && NumInsts <= 6;
3188 }
3189 case SCC_TRUE:
3190 case SCC_FALSE: {
3191 // FIXME: We could insert for VGPRs if we could replace the original compare
3192 // with a vector one.
3193 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3194 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3195 if (MRI.getRegClass(FalseReg) != RC)
3196 return false;
3197
3198 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3199
3200 // Multiples of 8 can do s_cselect_b64
3201 if (NumInsts % 2 == 0)
3202 NumInsts /= 2;
3203
3204 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3205 return RI.isSGPRClass(RC);
3206 }
3207 default:
3208 return false;
3209 }
3210}
3211
3212void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3213 MachineBasicBlock::iterator I, const DebugLoc &DL,
3214 Register DstReg, ArrayRef<MachineOperand> Cond,
3215 Register TrueReg, Register FalseReg) const {
3216 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3217 if (Pred == VCCZ || Pred == SCC_FALSE) {
3218 Pred = static_cast<BranchPredicate>(-Pred);
3219 std::swap(TrueReg, FalseReg);
3220 }
3221
3222 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3223 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3224 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3225
3226 if (DstSize == 32) {
3227 MachineInstr *Select;
3228 if (Pred == SCC_TRUE) {
3229 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3230 .addReg(TrueReg)
3231 .addReg(FalseReg);
3232 } else {
3233 // Instruction's operands are backwards from what is expected.
3234 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3235 .addReg(FalseReg)
3236 .addReg(TrueReg);
3237 }
3238
3239 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3240 return;
3241 }
3242
3243 if (DstSize == 64 && Pred == SCC_TRUE) {
3244 MachineInstr *Select =
3245 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3246 .addReg(TrueReg)
3247 .addReg(FalseReg);
3248
3249 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3250 return;
3251 }
3252
3253 static const int16_t Sub0_15[] = {
3254 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3255 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3256 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3257 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3258 };
3259
3260 static const int16_t Sub0_15_64[] = {
3261 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3262 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3263 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3264 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3265 };
3266
3267 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3268 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3269 const int16_t *SubIndices = Sub0_15;
3270 int NElts = DstSize / 32;
3271
3272 // 64-bit select is only available for SALU.
3273 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3274 if (Pred == SCC_TRUE) {
3275 if (NElts % 2) {
3276 SelOp = AMDGPU::S_CSELECT_B32;
3277 EltRC = &AMDGPU::SGPR_32RegClass;
3278 } else {
3279 SelOp = AMDGPU::S_CSELECT_B64;
3280 EltRC = &AMDGPU::SGPR_64RegClass;
3281 SubIndices = Sub0_15_64;
3282 NElts /= 2;
3283 }
3284 }
3285
3286 MachineInstrBuilder MIB = BuildMI(
3287 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3288
3289 I = MIB->getIterator();
3290
3291 SmallVector<Register, 8> Regs;
3292 for (int Idx = 0; Idx != NElts; ++Idx) {
3293 Register DstElt = MRI.createVirtualRegister(EltRC);
3294 Regs.push_back(DstElt);
3295
3296 unsigned SubIdx = SubIndices[Idx];
3297
3298 MachineInstr *Select;
3299 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3300 Select =
3301 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3302 .addReg(FalseReg, 0, SubIdx)
3303 .addReg(TrueReg, 0, SubIdx);
3304 } else {
3305 Select =
3306 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3307 .addReg(TrueReg, 0, SubIdx)
3308 .addReg(FalseReg, 0, SubIdx);
3309 }
3310
3311 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3313
3314 MIB.addReg(DstElt)
3315 .addImm(SubIdx);
3316 }
3317}
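// Example: a 128-bit select under SCC_TRUE expands to two S_CSELECT_B64
// instructions over the sub0_sub1 and sub2_sub3 halves, combined by the
// REG_SEQUENCE, while the same width under VCCNZ expands to four
// V_CNDMASK_B32_e32 instructions over sub0..sub3.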
3318
3319bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3320 switch (MI.getOpcode()) {
3321 case AMDGPU::V_MOV_B32_e32:
3322 case AMDGPU::V_MOV_B32_e64:
3323 case AMDGPU::V_MOV_B64_PSEUDO:
3324 case AMDGPU::V_MOV_B64_e32:
3325 case AMDGPU::V_MOV_B64_e64:
3326 case AMDGPU::S_MOV_B32:
3327 case AMDGPU::S_MOV_B64:
3328 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3329 case AMDGPU::COPY:
3330 case AMDGPU::WWM_COPY:
3331 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3332 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3333 case AMDGPU::V_ACCVGPR_MOV_B32:
3334 return true;
3335 default:
3336 return false;
3337 }
3338}
3339
3340static constexpr unsigned ModifierOpNames[] = {
3341 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3342 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3343 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3344
3345void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3346 unsigned Opc = MI.getOpcode();
3347 for (unsigned Name : reverse(ModifierOpNames)) {
3348 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3349 if (Idx >= 0)
3350 MI.removeOperand(Idx);
3351 }
3352}
3353
3354bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3355 Register Reg, MachineRegisterInfo *MRI) const {
3356 if (!MRI->hasOneNonDBGUse(Reg))
3357 return false;
3358
3359 switch (DefMI.getOpcode()) {
3360 default:
3361 return false;
3362 case AMDGPU::V_MOV_B64_e32:
3363 case AMDGPU::S_MOV_B64:
3364 case AMDGPU::V_MOV_B64_PSEUDO:
3365 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3366 case AMDGPU::V_MOV_B32_e32:
3367 case AMDGPU::S_MOV_B32:
3368 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3369 break;
3370 }
3371
3372 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
3373 assert(ImmOp);
3374 // FIXME: We could handle FrameIndex values here.
3375 if (!ImmOp->isImm())
3376 return false;
3377
3378 auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
3379 int64_t Imm = ImmOp->getImm();
3380 switch (UseOp.getSubReg()) {
3381 default:
3382 return Imm;
3383 case AMDGPU::sub0:
3384 return Lo_32(Imm);
3385 case AMDGPU::sub1:
3386 return Hi_32(Imm);
3387 case AMDGPU::lo16:
3388 return APInt(16, Imm).getSExtValue();
3389 case AMDGPU::hi16:
3390 return APInt(32, Imm).ashr(16).getSExtValue();
3391 case AMDGPU::sub1_lo16:
3392 return APInt(16, Hi_32(Imm)).getSExtValue();
3393 case AMDGPU::sub1_hi16:
3394 return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue();
3395 }
3396 };
3397
3398 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3399
3400 unsigned Opc = UseMI.getOpcode();
3401 if (Opc == AMDGPU::COPY) {
3402 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3403
3404 Register DstReg = UseMI.getOperand(0).getReg();
3405 unsigned OpSize = getOpSize(UseMI, 0);
3406 bool Is16Bit = OpSize == 2;
3407 bool Is64Bit = OpSize == 8;
3408 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3409 unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3410 : AMDGPU::V_MOV_B32_e32
3411 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3412 : AMDGPU::S_MOV_B32;
3413 APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)));
3414
3415 if (RI.isAGPR(*MRI, DstReg)) {
3416 if (Is64Bit || !isInlineConstant(Imm))
3417 return false;
3418 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3419 }
3420
3421 if (Is16Bit) {
3422 if (isVGPRCopy)
3423 return false; // Do not clobber vgpr_hi16
3424
3425 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3426 return false;
3427
3428 UseMI.getOperand(0).setSubReg(0);
3429 if (DstReg.isPhysical()) {
3430 DstReg = RI.get32BitRegister(DstReg);
3431 UseMI.getOperand(0).setReg(DstReg);
3432 }
3433 assert(UseMI.getOperand(1).getReg().isVirtual());
3434 }
3435
3436 const MCInstrDesc &NewMCID = get(NewOpc);
3437 if (DstReg.isPhysical() &&
3438 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3439 return false;
3440
3441 UseMI.setDesc(NewMCID);
3442 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3443 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3444 return true;
3445 }
3446
3447 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3448 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3449 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3450 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3451 Opc == AMDGPU::V_FMAC_F16_t16_e64) {
3452 // Don't fold if we are using source or output modifiers. The new VOP2
3453 // instructions don't have them.
3454 if (hasAnyModifiersSet(UseMI))
3455 return false;
3456
3457 // If this is a free constant, there's no reason to do this.
3458 // TODO: We could fold this here instead of letting SIFoldOperands do it
3459 // later.
3460 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
3461
3462 // Any src operand can be used for the legality check.
3463 if (isInlineConstant(UseMI, *Src0, *ImmOp))
3464 return false;
3465
3466 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3467 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
3468 bool IsFMA =
3469 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3470 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3471 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3472 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3473 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3474
3475 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3476 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3477 (Src1->isReg() && Src1->getReg() == Reg)) {
3478 MachineOperand *RegSrc =
3479 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3480 if (!RegSrc->isReg())
3481 return false;
3482 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3483 ST.getConstantBusLimit(Opc) < 2)
3484 return false;
3485
3486 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3487 return false;
3488
3489 // If src2 is also a literal constant then we have to choose which one to
3490 // fold. In general it is better to choose madak so that the other literal
3491 // can be materialized in an sgpr instead of a vgpr:
3492 // s_mov_b32 s0, literal
3493 // v_madak_f32 v0, s0, v0, literal
3494 // Instead of:
3495 // v_mov_b32 v1, literal
3496 // v_madmk_f32 v0, v0, literal, v1
3497 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3498 if (Def && Def->isMoveImmediate() &&
3499 !isInlineConstant(Def->getOperand(1)))
3500 return false;
3501
3502 unsigned NewOpc =
3503 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3504 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3505 : AMDGPU::V_FMAMK_F16)
3506 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3507 if (pseudoToMCOpcode(NewOpc) == -1)
3508 return false;
3509
3510 // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3511 // would also require restricting their register classes. For now
3512 // just bail out.
3513 if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
3514 return false;
3515
3516 const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
3517
3518 // FIXME: This would be a lot easier if we could return a new instruction
3519 // instead of having to modify in place.
3520
3521 Register SrcReg = RegSrc->getReg();
3522 unsigned SrcSubReg = RegSrc->getSubReg();
3523 Src0->setReg(SrcReg);
3524 Src0->setSubReg(SrcSubReg);
3525 Src0->setIsKill(RegSrc->isKill());
3526
3527 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3528 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3529 Opc == AMDGPU::V_FMAC_F16_e64)
3530 UseMI.untieRegOperand(
3531 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3532
3533 Src1->ChangeToImmediate(Imm);
3534
3536 UseMI.setDesc(get(NewOpc));
3537
3538 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3539 if (DeleteDef)
3540 DefMI.eraseFromParent();
3541
3542 return true;
3543 }
3544
3545 // Added part is the constant: Use v_madak_{f16, f32}.
3546 if (Src2->isReg() && Src2->getReg() == Reg) {
3547 if (ST.getConstantBusLimit(Opc) < 2) {
3548 // Not allowed to use constant bus for another operand.
3549 // We can however allow an inline immediate as src0.
3550 bool Src0Inlined = false;
3551 if (Src0->isReg()) {
3552 // Try to inline constant if possible.
3553 // If the def is a move of an immediate and this is its only use,
3554 // we save a VGPR here.
3555 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3556 if (Def && Def->isMoveImmediate() &&
3557 isInlineConstant(Def->getOperand(1)) &&
3558 MRI->hasOneUse(Src0->getReg())) {
3559 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3560 Src0Inlined = true;
3561 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3562 RI.isSGPRReg(*MRI, Src0->getReg())) {
3563 return false;
3564 }
3565 // VGPR is okay as Src0 - fallthrough
3566 }
3567
3568 if (Src1->isReg() && !Src0Inlined) {
3569 // We still have one constant-bus slot for an inlinable constant - try to fill it.
3570 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3571 if (Def && Def->isMoveImmediate() &&
3572 isInlineConstant(Def->getOperand(1)) &&
3573 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3574 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3575 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3576 return false;
3577 // VGPR is okay as Src1 - fallthrough
3578 }
3579 }
3580
3581 unsigned NewOpc =
3582 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3583 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3584 : AMDGPU::V_FMAAK_F16)
3585 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3586 if (pseudoToMCOpcode(NewOpc) == -1)
3587 return false;
3588
3589 // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3590 // would also require restricting their register classes. For now
3591 // just bail out.
3592 if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
3593 return false;
3594
3595 // FIXME: This would be a lot easier if we could return a new instruction
3596 // instead of having to modify in place.
3597
3598 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3599 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3600 Opc == AMDGPU::V_FMAC_F16_e64)
3601 UseMI.untieRegOperand(
3602 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3603
3604 // ChangingToImmediate adds Src2 back to the instruction.
3605 Src2->ChangeToImmediate(getImmFor(*Src2));
3606
3607 // These come before src2.
3608 removeModOperands(UseMI);
3609 UseMI.setDesc(get(NewOpc));
3610 // It might happen that UseMI was commuted, so we may now have an SGPR as
3611 // src1. In that case the literal constant together with an SGPR would
3612 // violate the constant bus restriction, so legalize the operands again.
3613 legalizeOperands(UseMI);
3614
3615 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3616 if (DeleteDef)
3617 DefMI.eraseFromParent();
3618
3619 return true;
3620 }
3621 }
3622
3623 return false;
3624}
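// Informal summary of the folding above: when the multiplied operand is the
// immediate, MAD/FMA is rewritten to the *MADMK/*FMAMK form, and when the
// added operand is the immediate, the *MADAK/*FMAAK form is used, e.g.
// (roughly)
//   v_mad_f32 v0, v1, <imm>, v2  -->  v_madmk_f32 v0, v1, <imm>, v2
//   v_mad_f32 v0, v1, v2, <imm>  -->  v_madak_f32 v0, v1, v2, <imm>
// subject to the constant-bus and opcode-availability checks performed above.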
3625
3626 static bool
3627 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3628                            ArrayRef<const MachineOperand *> BaseOps2) {
3629   if (BaseOps1.size() != BaseOps2.size())
3630 return false;
3631 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3632 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3633 return false;
3634 }
3635 return true;
3636}
3637
3638static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3639 LocationSize WidthB, int OffsetB) {
3640 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3641 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3642 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3643 return LowWidth.hasValue() &&
3644 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3645}
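// For example, a 4-byte access at offset 0 and an 8-byte access at offset 8
// do not overlap because 0 + 4 <= 8. If the width of the lower access is not
// a known fixed value, hasValue() is false and a possible overlap is
// conservatively reported.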
3646
3647bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3648 const MachineInstr &MIb) const {
3649 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3650 int64_t Offset0, Offset1;
3651 LocationSize Dummy0 = 0, Dummy1 = 0;
3652 bool Offset0IsScalable, Offset1IsScalable;
3653 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3654 Dummy0, &RI) ||
3655 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3656 Dummy1, &RI))
3657 return false;
3658
3659 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3660 return false;
3661
3662 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3663 // FIXME: Handle ds_read2 / ds_write2.
3664 return false;
3665 }
3666 LocationSize Width0 = MIa.memoperands().front()->getSize();
3667 LocationSize Width1 = MIb.memoperands().front()->getSize();
3668 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3669}
3670
3671 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3672                                                   const MachineInstr &MIb) const {
3673 assert(MIa.mayLoadOrStore() &&
3674 "MIa must load from or modify a memory location");
3675 assert(MIb.mayLoadOrStore() &&
3676 "MIb must load from or modify a memory location");
3677
3678 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3679   return false;
3680
3681 // XXX - Can we relax this between address spaces?
3682 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3683 return false;
3684
3685 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3686 return false;
3687
3688 // TODO: Should we check the address space from the MachineMemOperand? That
3689 // would allow us to distinguish objects we know don't alias based on the
3690 // underlying address space, even if it was lowered to a different one,
3691 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3692 // buffer.
3693 if (isDS(MIa)) {
3694 if (isDS(MIb))
3695 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3696
3697 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3698 }
3699
3700 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3701 if (isMUBUF(MIb) || isMTBUF(MIb))
3702 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3703
3704 if (isFLAT(MIb))
3705 return isFLATScratch(MIb);
3706
3707 return !isSMRD(MIb);
3708 }
3709
3710 if (isSMRD(MIa)) {
3711 if (isSMRD(MIb))
3712 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3713
3714 if (isFLAT(MIb))
3715 return isFLATScratch(MIb);
3716
3717 return !isMUBUF(MIb) && !isMTBUF(MIb);
3718 }
3719
3720 if (isFLAT(MIa)) {
3721 if (isFLAT(MIb)) {
3722 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3723 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3724 return true;
3725
3726 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3727 }
3728
3729 return false;
3730 }
3731
3732 return false;
3733}
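// Rough summary of the logic above: accesses that provably target different
// memory segments (LDS vs. global/scratch FLAT, buffer vs. scratch, scalar
// vs. vector memory) are reported as disjoint; accesses in the same segment
// fall back to the base-operand and offset overlap check; anything ordered,
// LDS-DMA related, or otherwise unanalyzable is conservatively treated as
// potentially aliasing.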
3734
3735 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3736                            int64_t &Imm, MachineInstr **DefMI = nullptr) {
3737 if (Reg.isPhysical())
3738 return false;
3739 auto *Def = MRI.getUniqueVRegDef(Reg);
3740 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3741 Imm = Def->getOperand(1).getImm();
3742 if (DefMI)
3743 *DefMI = Def;
3744 return true;
3745 }
3746 return false;
3747}
3748
3749static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3750 MachineInstr **DefMI = nullptr) {
3751 if (!MO->isReg())
3752 return false;
3753 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3754 const MachineRegisterInfo &MRI = MF->getRegInfo();
3755 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3756}
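// Note on the operand-based overload: a MachineOperand has no direct handle to
// the MachineRegisterInfo, so it is reached through the owning instruction's
// parent block and function (MO -> MI -> MBB -> MF) before delegating to the
// register-based overload above.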
3757
3758 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3759                                 MachineInstr &NewMI) {
3760 if (LV) {
3761 unsigned NumOps = MI.getNumOperands();
3762 for (unsigned I = 1; I < NumOps; ++I) {
3763 MachineOperand &Op = MI.getOperand(I);
3764 if (Op.isReg() && Op.isKill())
3765 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3766 }
3767 }
3768}
3769
3770 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3771                                                  LiveVariables *LV,
3772 LiveIntervals *LIS) const {
3773 MachineBasicBlock &MBB = *MI.getParent();
3774 unsigned Opc = MI.getOpcode();
3775
3776 // Handle MFMA.
3777 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3778 if (NewMFMAOpc != -1) {
3779   MachineInstrBuilder MIB =
3780       BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3781 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3782 MIB.add(MI.getOperand(I));
3783 updateLiveVariables(LV, MI, *MIB);
3784 if (LIS) {
3785 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3786 // SlotIndex of defs needs to be updated when converting to early-clobber
3787 MachineOperand &Def = MIB->getOperand(0);
3788 if (Def.isEarlyClobber() && Def.isReg() &&
3789 LIS->hasInterval(Def.getReg())) {
3790 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
3791 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
3792 auto &LI = LIS->getInterval(Def.getReg());
3793 auto UpdateDefIndex = [&](LiveRange &LR) {
3794 auto S = LR.find(OldIndex);
3795 if (S != LR.end() && S->start == OldIndex) {
3796 assert(S->valno && S->valno->def == OldIndex);
3797 S->start = NewIndex;
3798 S->valno->def = NewIndex;
3799 }
3800 };
3801 UpdateDefIndex(LI);
3802 for (auto &SR : LI.subranges())
3803 UpdateDefIndex(SR);
3804 }
3805 }
3806 return MIB;
3807 }
3808
3809 if (SIInstrInfo::isWMMA(MI)) {
3810 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
3811 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3812 .setMIFlags(MI.getFlags());
3813 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3814 MIB->addOperand(MI.getOperand(I));
3815
3816 updateLiveVariables(LV, MI, *MIB);
3817 if (LIS)
3818 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3819
3820 return MIB;
3821 }
3822
3823 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
3824 "V_FMAC_F16_t16_e32 is not supported and not expected to be present "
3825 "pre-RA");
3826
3827 // Handle MAC/FMAC.
3828 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3829 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3830 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3831 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3832 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3833 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3834 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3835 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3836 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3837 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3838 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3839 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3840 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3841 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3842 bool Src0Literal = false;
3843
3844 switch (Opc) {
3845 default:
3846 return nullptr;
3847 case AMDGPU::V_MAC_F16_e64:
3848 case AMDGPU::V_FMAC_F16_e64:
3849 case AMDGPU::V_FMAC_F16_t16_e64:
3850 case AMDGPU::V_MAC_F32_e64:
3851 case AMDGPU::V_MAC_LEGACY_F32_e64:
3852 case AMDGPU::V_FMAC_F32_e64:
3853 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3854 case AMDGPU::V_FMAC_F64_e64:
3855 break;
3856 case AMDGPU::V_MAC_F16_e32:
3857 case AMDGPU::V_FMAC_F16_e32:
3858 case AMDGPU::V_MAC_F32_e32:
3859 case AMDGPU::V_MAC_LEGACY_F32_e32:
3860 case AMDGPU::V_FMAC_F32_e32:
3861 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3862 case AMDGPU::V_FMAC_F64_e32: {
3863 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3864 AMDGPU::OpName::src0);
3865 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3866 if (!Src0->isReg() && !Src0->isImm())
3867 return nullptr;
3868
3869 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3870 Src0Literal = true;
3871
3872 break;
3873 }
3874 }
3875
3876 MachineInstrBuilder MIB;
3877 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3878 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3879 const MachineOperand *Src0Mods =
3880 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3881 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3882 const MachineOperand *Src1Mods =
3883 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3884 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3885 const MachineOperand *Src2Mods =
3886 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
3887 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3888 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3889 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
3890
3891 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
3892 !IsLegacy &&
3893 // If we have an SGPR input, we will violate the constant bus restriction.
3894 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3895 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3896 MachineInstr *DefMI;
3897 const auto killDef = [&]() -> void {
3898   const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3899   // The only user is the instruction which will be killed.
3900 Register DefReg = DefMI->getOperand(0).getReg();
3901 if (!MRI.hasOneNonDBGUse(DefReg))
3902 return;
3903 // We cannot just remove the DefMI here, calling pass will crash.
3904 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
3905 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
3906     DefMI->removeOperand(I);
3907   if (LV)
3908 LV->getVarInfo(DefReg).AliveBlocks.clear();
3909 };
3910
3911 int64_t Imm;
3912 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
3913 unsigned NewOpc =
3914 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3915 : AMDGPU::V_FMAAK_F16)
3916 : AMDGPU::V_FMAAK_F32)
3917 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
3918 if (pseudoToMCOpcode(NewOpc) != -1) {
3919 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3920 .add(*Dst)
3921 .add(*Src0)
3922 .add(*Src1)
3923 .addImm(Imm);
3924 updateLiveVariables(LV, MI, *MIB);
3925 if (LIS)
3926 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3927 killDef();
3928 return MIB;
3929 }
3930 }
3931 unsigned NewOpc =
3932 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3933 : AMDGPU::V_FMAMK_F16)
3934 : AMDGPU::V_FMAMK_F32)
3935 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
3936 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
3937 if (pseudoToMCOpcode(NewOpc) != -1) {
3938 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3939 .add(*Dst)
3940 .add(*Src0)
3941 .addImm(Imm)
3942 .add(*Src2);
3943 updateLiveVariables(LV, MI, *MIB);
3944 if (LIS)
3945 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3946 killDef();
3947 return MIB;
3948 }
3949 }
3950 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
3951 if (Src0Literal) {
3952 Imm = Src0->getImm();
3953 DefMI = nullptr;
3954 }
3955 if (pseudoToMCOpcode(NewOpc) != -1 &&
3956     isOperandLegal(
3957         MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
3958 Src1)) {
3959 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3960 .add(*Dst)
3961 .add(*Src1)
3962 .addImm(Imm)
3963 .add(*Src2);
3964 updateLiveVariables(LV, MI, *MIB);
3965 if (LIS)
3966 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3967 if (DefMI)
3968 killDef();
3969 return MIB;
3970 }
3971 }
3972 }
3973
3974 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
3975 // if VOP3 does not allow a literal operand.
3976 if (Src0Literal && !ST.hasVOP3Literal())
3977 return nullptr;
3978
3979 unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
3980 : IsF64 ? AMDGPU::V_FMA_F64_e64
3981 : IsLegacy
3982 ? AMDGPU::V_FMA_LEGACY_F32_e64
3983 : AMDGPU::V_FMA_F32_e64
3984 : IsF16 ? AMDGPU::V_MAD_F16_e64
3985 : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
3986 : AMDGPU::V_MAD_F32_e64;
3987 if (pseudoToMCOpcode(NewOpc) == -1)
3988 return nullptr;
3989
3990 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3991 .add(*Dst)
3992 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
3993 .add(*Src0)
3994 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
3995 .add(*Src1)
3996 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
3997 .add(*Src2)
3998 .addImm(Clamp ? Clamp->getImm() : 0)
3999 .addImm(Omod ? Omod->getImm() : 0);
4000 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4001 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4002 updateLiveVariables(LV, MI, *MIB);
4003 if (LIS)
4004 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4005 return MIB;
4006}
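// Informal sketch of the conversion above: a tied two-address MAC/FMAC such as
//   v_fmac_f32 vDst, vSrc0, vSrc1      // vDst also acts as the accumulator
// is rebuilt as the untied VOP3 form
//   v_fma_f32 vDst, vSrc0, vSrc1, vAcc
// and, when a source is produced by a foldable move-immediate, the FMAAK/FMAMK
// (or MADAK/MADMK) variants are preferred so the literal is embedded directly
// in the new instruction.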
4007
4008// It's not generally safe to move VALU instructions across these since it will
4009// start using the register as a base index rather than directly.
4010// XXX - Why isn't hasSideEffects sufficient for these?
4011 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4012   switch (MI.getOpcode()) {
4013 case AMDGPU::S_SET_GPR_IDX_ON:
4014 case AMDGPU::S_SET_GPR_IDX_MODE:
4015 case AMDGPU::S_SET_GPR_IDX_OFF:
4016 return true;
4017 default:
4018 return false;
4019 }
4020}
4021
4022 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4023                                        const MachineBasicBlock *MBB,
4024 const MachineFunction &MF) const {
4025 // Skipping the check for SP writes in the base implementation. The reason it
4026 // was added was apparently due to compile time concerns.
4027 //
4028 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4029 // but is probably avoidable.
4030
4031 // Copied from base implementation.
4032 // Terminators and labels can't be scheduled around.
4033 if (MI.isTerminator() || MI.isPosition())
4034 return true;
4035
4036 // INLINEASM_BR can jump to another block
4037 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4038 return true;
4039
4040 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4041 return true;
4042
4043 // Target-independent instructions do not have an implicit-use of EXEC, even
4044 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4045 // boundaries prevents incorrect movements of such instructions.
4046 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4047 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4048 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4049 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4050          changesVGPRIndexingMode(MI);
4051 }
4052
4053 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4054   return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
4055}
4056
4057 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4058   // Skip the full operand and register alias search modifiesRegister
4059 // does. There's only a handful of instructions that touch this, it's only an
4060 // implicit def, and doesn't alias any other registers.
4061 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4062}
4063
4064 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4065   unsigned Opcode = MI.getOpcode();
4066
4067 if (MI.mayStore() && isSMRD(MI))
4068 return true; // scalar store or atomic
4069
4070 // This will terminate the function when other lanes may need to continue.
4071 if (MI.isReturn())
4072 return true;
4073
4074 // These instructions cause shader I/O that may cause hardware lockups
4075 // when executed with an empty EXEC mask.
4076 //
4077 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4078 // EXEC = 0, but checking for that case here seems not worth it
4079 // given the typical code patterns.
4080 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4081 isEXP(Opcode) ||
4082 Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
4083 Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
4084 return true;
4085
4086 if (MI.isCall() || MI.isInlineAsm())
4087 return true; // conservative assumption
4088
4089 // A mode change is a scalar operation that influences vector instructions.
4090   if (modifiesModeRegister(MI))
4091     return true;
4092
4093 // These are like SALU instructions in terms of effects, so it's questionable
4094 // whether we should return true for those.
4095 //
4096 // However, executing them with EXEC = 0 causes them to operate on undefined
4097 // data, which we avoid by returning true here.
4098 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4099 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4100 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4101 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4102 return true;
4103
4104 return false;
4105}
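// In short: scalar memory stores, shader I/O (messages, exports, GWS), traps,
// returns, calls, inline asm, mode changes and lane moves are all flagged
// above, because executing them with EXEC = 0 can hang the hardware or operate
// on undefined data.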
4106
4107 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4108                               const MachineInstr &MI) const {
4109 if (MI.isMetaInstruction())
4110 return false;
4111
4112 // This won't read exec if this is an SGPR->SGPR copy.
4113 if (MI.isCopyLike()) {
4114 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4115 return true;
4116
4117 // Make sure this isn't copying exec as a normal operand
4118 return MI.readsRegister(AMDGPU::EXEC, &RI);
4119 }
4120
4121 // Make a conservative assumption about the callee.
4122 if (MI.isCall())
4123 return true;
4124
4125 // Be conservative with any unhandled generic opcodes.
4126 if (!isTargetSpecificOpcode(MI.getOpcode()))
4127 return true;
4128
4129 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4130}
4131
4132bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4133 switch (Imm.getBitWidth()) {
4134 case 1: // This likely will be a condition code mask.
4135 return true;
4136
4137 case 32:
4138 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4139 ST.hasInv2PiInlineImm());
4140 case 64:
4141 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4142 ST.hasInv2PiInlineImm());
4143 case 16:
4144 return ST.has16BitInsts() &&
4145 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4146 ST.hasInv2PiInlineImm());
4147 default:
4148 llvm_unreachable("invalid bitwidth");
4149 }
4150}
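// As a reminder, the hardware inline constants these helpers test for are
// roughly the integers -16..64, +/-0.5, +/-1.0, +/-2.0, +/-4.0, 0.0 and, on
// subtargets with hasInv2PiInlineImm(), 1/(2*pi); anything else must be
// encoded as a literal and counts against the constant bus.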
4151
4152 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4153   APInt IntImm = Imm.bitcastToAPInt();
4154 int64_t IntImmVal = IntImm.getSExtValue();
4155 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4156 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4157 default:
4158 llvm_unreachable("invalid fltSemantics");
4159 case APFloatBase::S_IEEEsingle:
4160 case APFloatBase::S_IEEEdouble:
4161   return isInlineConstant(IntImm);
4162 case APFloatBase::S_BFloat:
4163   return ST.has16BitInsts() &&
4164 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4165 case APFloatBase::S_IEEEhalf:
4166   return ST.has16BitInsts() &&
4167 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4168 }
4169}
4170
4171 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
4172                                    uint8_t OperandType) const {
4173 assert(!MO.isReg() && "isInlineConstant called on register operand!");
4174 if (!MO.isImm())
4175 return false;
4176
4177 // MachineOperand provides no way to tell the true operand size, since it only
4178 // records a 64-bit value. We need to know the size to determine if a 32-bit
4179 // floating point immediate bit pattern is legal for an integer immediate. It
4180 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4181
4182 int64_t Imm = MO.getImm();
4183 switch (OperandType) {
4196 int32_t Trunc = static_cast<int32_t>(Imm);
4197   return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4198 }
4204   return AMDGPU::isInlinableLiteral64(MO.getImm(),
4205                                       ST.hasInv2PiInlineImm());
4209 // We would expect inline immediates to not be concerned with an integer/fp
4210 // distinction. However, in the case of 16-bit integer operations, the
4211 // "floating point" values appear to not work. It seems read the low 16-bits
4212 // of 32-bit immediates, which happens to always work for the integer
4213 // values.
4214 //
4215 // See llvm bugzilla 46302.
4216 //
4217 // TODO: Theoretically we could use op-sel to use the high bits of the
4218 // 32-bit FP values.
4236 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4237 // A few special case instructions have 16-bit operands on subtargets
4238 // where 16-bit instructions are not legal.
4239 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4240 // constants in these cases
4241 int16_t Trunc = static_cast<int16_t>(Imm);
4242 return ST.has16BitInsts() &&
4243          AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4244 }
4245
4246 return false;
4247 }
4252 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4253 int16_t Trunc = static_cast<int16_t>(Imm);
4254 return ST.has16BitInsts() &&
4255          AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4256 }
4257 return false;
4258 }
4261 return false;
4264 // Always embedded in the instruction for free.
4265 return true;
4275 // Just ignore anything else.
4276 return true;
4277 default:
4278 llvm_unreachable("invalid operand type");
4279 }
4280}
4281
4282static bool compareMachineOp(const MachineOperand &Op0,
4283 const MachineOperand &Op1) {
4284 if (Op0.getType() != Op1.getType())
4285 return false;
4286
4287 switch (Op0.getType()) {
4289 return Op0.getReg() == Op1.getReg();
4291 return Op0.getImm() == Op1.getImm();
4292 default:
4293 llvm_unreachable("Didn't expect to be comparing these operand types");
4294 }
4295}
4296
4297 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4298                                     const MachineOperand &MO) const {
4299 const MCInstrDesc &InstDesc = MI.getDesc();
4300 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4301
4302 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4303
4304 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4305   return true;
4306
4307 if (OpInfo.RegClass < 0)
4308 return false;
4309
4310 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4311 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4312 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4313 AMDGPU::OpName::src2))
4314 return false;
4315 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4316 }
4317
4318 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4319 return false;
4320
4321 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4322 return true;
4323
4324 return ST.hasVOP3Literal();
4325}
4326
4327bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4328 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4329 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4330 return false;
4331
4332 int Op32 = AMDGPU::getVOPe32(Opcode);
4333 if (Op32 == -1)
4334 return false;
4335
4336 return pseudoToMCOpcode(Op32) != -1;
4337}
4338
4339bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4340 // The src0_modifier operand is present on all instructions
4341 // that have modifiers.
4342
4343 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4344}
4345
4346 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4347                                   unsigned OpName) const {
4348 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4349 return Mods && Mods->getImm();
4350}
4351
4352 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4353   return any_of(ModifierOpNames,
4354 [&](unsigned Name) { return hasModifiersSet(MI, Name); });
4355}
4356
4357 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4358                             const MachineRegisterInfo &MRI) const {
4359 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4360 // Can't shrink instruction with three operands.
4361 if (Src2) {
4362 switch (MI.getOpcode()) {
4363 default: return false;
4364
4365 case AMDGPU::V_ADDC_U32_e64:
4366 case AMDGPU::V_SUBB_U32_e64:
4367 case AMDGPU::V_SUBBREV_U32_e64: {
4368 const MachineOperand *Src1
4369 = getNamedOperand(MI, AMDGPU::OpName::src1);
4370 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4371 return false;
4372 // Additional verification is needed for sdst/src2.
4373 return true;
4374 }
4375 case AMDGPU::V_MAC_F16_e64:
4376 case AMDGPU::V_MAC_F32_e64:
4377 case AMDGPU::V_MAC_LEGACY_F32_e64:
4378 case AMDGPU::V_FMAC_F16_e64:
4379 case AMDGPU::V_FMAC_F16_t16_e64:
4380 case AMDGPU::V_FMAC_F32_e64:
4381 case AMDGPU::V_FMAC_F64_e64:
4382 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4383 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4384 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4385 return false;
4386 break;
4387
4388 case AMDGPU::V_CNDMASK_B32_e64:
4389 break;
4390 }
4391 }
4392
4393 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4394 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4395 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4396 return false;
4397
4398 // We don't need to check src0, all input types are legal, so just make sure
4399 // src0 isn't using any modifiers.
4400 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4401 return false;
4402
4403 // Can it be shrunk to a valid 32 bit opcode?
4404 if (!hasVALU32BitEncoding(MI.getOpcode()))
4405 return false;
4406
4407 // Check output modifiers
4408 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4409 !hasModifiersSet(MI, AMDGPU::OpName::clamp);
4410}
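// Illustrative example: v_add_f32_e64 v0, v1, v2 with no source or output
// modifiers can be shrunk to v_add_f32_e32, while the same instruction with an
// SGPR or a modifier on src1, or with clamp/omod set, must stay in the 64-bit
// VOP3 encoding.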
4411
4412// Set VCC operand with all flags from \p Orig, except for setting it as
4413// implicit.
4414 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4415                                    const MachineOperand &Orig) {
4416
4417 for (MachineOperand &Use : MI.implicit_operands()) {
4418 if (Use.isUse() &&
4419 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4420 Use.setIsUndef(Orig.isUndef());
4421 Use.setIsKill(Orig.isKill());
4422 return;
4423 }
4424 }
4425}
4426
4427 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4428                                            unsigned Op32) const {
4429 MachineBasicBlock *MBB = MI.getParent();
4430 MachineInstrBuilder Inst32 =
4431 BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
4432 .setMIFlags(MI.getFlags());
4433
4434 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4435 // For VOPC instructions, this is replaced by an implicit def of vcc.
4436 if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::vdst)) {
4437 // dst
4438 Inst32.add(MI.getOperand(0));
4439 } else if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::sdst)) {
4440 // VOPCX instructions won't be writing to an explicit dst, so this should
4441 // not fail for these instructions.
4442 assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
4443 (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
4444 "Unexpected case");
4445 }
4446
4447 Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
4448
4449 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4450 if (Src1)
4451 Inst32.add(*Src1);
4452
4453 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4454
4455 if (Src2) {
4456 int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
4457 if (Op32Src2Idx != -1) {
4458 Inst32.add(*Src2);
4459 } else {
4460 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4461 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4462 // of vcc was already added during the initial BuildMI, but we
4463 // 1) may need to change vcc to vcc_lo to preserve the original register
4464 // 2) have to preserve the original flags.
4465 fixImplicitOperands(*Inst32);
4466 copyFlagsToImplicitVCC(*Inst32, *Src2);
4467 }
4468 }
4469
4470 return Inst32;
4471}
4472
4473 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4474                                   const MachineOperand &MO,
4475 const MCOperandInfo &OpInfo) const {
4476 // Literal constants use the constant bus.
4477 if (!MO.isReg())
4478 return !isInlineConstant(MO, OpInfo);
4479
4480 if (!MO.isUse())
4481 return false;
4482
4483 if (MO.getReg().isVirtual())
4484 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4485
4486 // Null is free
4487 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4488 return false;
4489
4490 // SGPRs use the constant bus
4491 if (MO.isImplicit()) {
4492 return MO.getReg() == AMDGPU::M0 ||
4493 MO.getReg() == AMDGPU::VCC ||
4494 MO.getReg() == AMDGPU::VCC_LO;
4495 } else {
4496 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4497 AMDGPU::SReg_64RegClass.contains(MO.getReg());
4498 }
4499}
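// Background for the check above: a VALU instruction may read only a limited
// number of values over the scalar (constant) bus. SGPRs, literal constants
// and implicit reads of M0/VCC all count, while VGPRs, inline constants and
// the null register do not; the per-opcode limit is queried elsewhere via
// ST.getConstantBusLimit().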
4500
4501 Register SIInstrInfo::findImplicitSGPRRead(const MachineInstr &MI) {
4502   for (const MachineOperand &MO : MI.implicit_operands()) {
4503 // We only care about reads.
4504 if (MO.isDef())
4505 continue;
4506
4507 switch (MO.getReg()) {
4508 case AMDGPU::VCC:
4509 case AMDGPU::VCC_LO:
4510 case AMDGPU::VCC_HI:
4511 case AMDGPU::M0:
4512 case AMDGPU::FLAT_SCR:
4513 return MO.getReg();
4514
4515 default:
4516 break;
4517 }
4518 }
4519
4520 return Register();
4521}
4522
4523static bool shouldReadExec(const MachineInstr &MI) {
4524 if (SIInstrInfo::isVALU(MI)) {
4525 switch (MI.getOpcode()) {
4526 case AMDGPU::V_READLANE_B32:
4527 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4528 case AMDGPU::V_WRITELANE_B32:
4529 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4530 return false;
4531 }
4532
4533 return true;
4534 }
4535
4536 if (MI.isPreISelOpcode() ||
4537 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4538       SIInstrInfo::isSALU(MI) ||
4539       SIInstrInfo::isSMRD(MI))
4540     return false;
4541
4542 return true;
4543}
4544
4545static bool isSubRegOf(const SIRegisterInfo &TRI,
4546 const MachineOperand &SuperVec,
4547 const MachineOperand &SubReg) {
4548 if (SubReg.getReg().isPhysical())
4549 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4550
4551 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4552 SubReg.getReg() == SuperVec.getReg();
4553}
4554
4555 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4556                                     StringRef &ErrInfo) const {
4557 uint16_t Opcode = MI.getOpcode();
4558 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
4559 return true;
4560
4561 const MachineFunction *MF = MI.getParent()->getParent();
4562 const MachineRegisterInfo &MRI = MF->getRegInfo();
4563
4564 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4565 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4566 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4567 int Src3Idx = -1;
4568 if (Src0Idx == -1) {
4569 // VOPD V_DUAL_* instructions use different operand names.
4570 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4571 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4572 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4573 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4574 }
4575
4576 // Make sure the number of operands is correct.
4577 const MCInstrDesc &Desc = get(Opcode);
4578 if (!Desc.isVariadic() &&
4579 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4580 ErrInfo = "Instruction has wrong number of operands.";
4581 return false;
4582 }
4583
4584 if (MI.isInlineAsm()) {
4585 // Verify register classes for inlineasm constraints.
4586 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4587 I != E; ++I) {
4588 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4589 if (!RC)
4590 continue;
4591
4592 const MachineOperand &Op = MI.getOperand(I);
4593 if (!Op.isReg())
4594 continue;
4595
4596 Register Reg = Op.getReg();
4597 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4598 ErrInfo = "inlineasm operand has incorrect register class.";
4599 return false;
4600 }
4601 }
4602
4603 return true;
4604 }
4605
4606 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4607 ErrInfo = "missing memory operand from image instruction.";
4608 return false;
4609 }
4610
4611 // Make sure the register classes are correct.
4612 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4613 const MachineOperand &MO = MI.getOperand(i);
4614 if (MO.isFPImm()) {
4615 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4616 "all fp values to integers.";
4617 return false;
4618 }
4619
4620 int RegClass = Desc.operands()[i].RegClass;
4621
4622 switch (Desc.operands()[i].OperandType) {
4623 case MCOI::OPERAND_REGISTER:
4624   if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4625 ErrInfo = "Illegal immediate value for operand.";
4626 return false;
4627 }
4628 break;
4633 break;
4645 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4646 ErrInfo = "Illegal immediate value for operand.";
4647 return false;
4648 }
4649 break;
4650 }
4651 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
4652   if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4653 ErrInfo = "Expected inline constant for operand.";
4654 return false;
4655 }
4656 break;
4657 case MCOI::OPERAND_IMMEDIATE:
4658 case AMDGPU::OPERAND_KIMM32:
4659   // Check if this operand is an immediate.
4660 // FrameIndex operands will be replaced by immediates, so they are
4661 // allowed.
4662 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4663 ErrInfo = "Expected immediate, but got non-immediate";
4664 return false;
4665 }
4666 [[fallthrough]];
4667 default:
4668 continue;
4669 }
4670
4671 if (!MO.isReg())
4672 continue;
4673 Register Reg = MO.getReg();
4674 if (!Reg)
4675 continue;
4676
4677 // FIXME: Ideally we would have separate instruction definitions with the
4678 // aligned register constraint.
4679 // FIXME: We do not verify inline asm operands, but custom inline asm
4680 // verification is broken anyway
4681 if (ST.needsAlignedVGPRs()) {
4682 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4683 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4684 const TargetRegisterClass *SubRC =
4685 RI.getSubRegisterClass(RC, MO.getSubReg());
4686 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4687 if (RC)
4688 RC = SubRC;
4689 }
4690
4691 // Check that this is the aligned version of the class.
4692 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4693 ErrInfo = "Subtarget requires even aligned vector registers";
4694 return false;
4695 }
4696 }
4697
4698 if (RegClass != -1) {
4699 if (Reg.isVirtual())
4700 continue;
4701
4702 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4703 if (!RC->contains(Reg)) {
4704 ErrInfo = "Operand has incorrect register class.";
4705 return false;
4706 }
4707 }
4708 }
4709
4710 // Verify SDWA
4711 if (isSDWA(MI)) {
4712 if (!ST.hasSDWA()) {
4713 ErrInfo = "SDWA is not supported on this target";
4714 return false;
4715 }
4716
4717 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4718
4719 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4720 if (OpIdx == -1)
4721 continue;
4722 const MachineOperand &MO = MI.getOperand(OpIdx);
4723
4724 if (!ST.hasSDWAScalar()) {
4725 // Only VGPRS on VI
4726 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4727 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4728 return false;
4729 }
4730 } else {
4731 // No immediates on GFX9
4732 if (!MO.isReg()) {
4733 ErrInfo =
4734 "Only reg allowed as operands in SDWA instructions on GFX9+";
4735 return false;
4736 }
4737 }
4738 }
4739
4740 if (!ST.hasSDWAOmod()) {
4741 // No omod allowed on VI
4742 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4743 if (OMod != nullptr &&
4744 (!OMod->isImm() || OMod->getImm() != 0)) {
4745 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4746 return false;
4747 }
4748 }
4749
4750 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
4751 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
4752 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
4753 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
4754 const MachineOperand *Src0ModsMO =
4755 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4756 unsigned Mods = Src0ModsMO->getImm();
4757 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
4758 Mods & SISrcMods::SEXT) {
4759 ErrInfo = "sext, abs and neg are not allowed on this instruction";
4760 return false;
4761 }
4762 }
4763
4764 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4765 if (isVOPC(BasicOpcode)) {
4766 if (!ST.hasSDWASdst() && DstIdx != -1) {
4767 // Only vcc allowed as dst on VI for VOPC
4768 const MachineOperand &Dst = MI.getOperand(DstIdx);
4769 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4770 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4771 return false;
4772 }
4773 } else if (!ST.hasSDWAOutModsVOPC()) {
4774 // No clamp allowed on GFX9 for VOPC
4775 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4776 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4777 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
4778 return false;
4779 }
4780
4781 // No omod allowed on GFX9 for VOPC
4782 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4783 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4784 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
4785 return false;
4786 }
4787 }
4788 }
4789
4790 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
4791 if (DstUnused && DstUnused->isImm() &&
4792 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4793 const MachineOperand &Dst = MI.getOperand(DstIdx);
4794 if (!Dst.isReg() || !Dst.isTied()) {
4795 ErrInfo = "Dst register should have tied register";
4796 return false;
4797 }
4798
4799 const MachineOperand &TiedMO =
4800 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
4801 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
4802 ErrInfo =
4803 "Dst register should be tied to implicit use of preserved register";
4804 return false;
4805 } else if (TiedMO.getReg().isPhysical() &&
4806 Dst.getReg() != TiedMO.getReg()) {
4807 ErrInfo = "Dst register should use same physical register as preserved";
4808 return false;
4809 }
4810 }
4811 }
4812
4813 // Verify MIMG / VIMAGE / VSAMPLE
4814 if (isImage(MI.getOpcode()) && !MI.mayStore()) {
4815 // Ensure that the return type used is large enough for all the options
4816 // being used TFE/LWE require an extra result register.
4817 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4818 if (DMask) {
4819 uint64_t DMaskImm = DMask->getImm();
4820 uint32_t RegCount =
4821 isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm);
4822 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
4823 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
4824 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
4825
4826 // Adjust for packed 16 bit values
4827 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
4828 RegCount = divideCeil(RegCount, 2);
4829
4830 // Adjust if using LWE or TFE
4831 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
4832 RegCount += 1;
4833
4834 const uint32_t DstIdx =
4835 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
4836 const MachineOperand &Dst = MI.getOperand(DstIdx);
4837 if (Dst.isReg()) {
4838 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
4839 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
4840 if (RegCount > DstSize) {
4841 ErrInfo = "Image instruction returns too many registers for dst "
4842 "register class";
4843 return false;
4844 }
4845 }
4846 }
4847 }
4848
4849 // Verify VOP*. Ignore multiple sgpr operands on writelane.
4850 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
4851 unsigned ConstantBusCount = 0;
4852 bool UsesLiteral = false;
4853 const MachineOperand *LiteralVal = nullptr;
4854
4855 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
4856 if (ImmIdx != -1) {
4857 ++ConstantBusCount;
4858 UsesLiteral = true;
4859 LiteralVal = &MI.getOperand(ImmIdx);
4860 }
4861
4862 SmallVector<Register, 2> SGPRsUsed;
4863 Register SGPRUsed;
4864
4865 // Only look at the true operands. Only a real operand can use the constant
4866 // bus, and we don't want to check pseudo-operands like the source modifier
4867 // flags.
4868 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
4869 if (OpIdx == -1)
4870 continue;
4871 const MachineOperand &MO = MI.getOperand(OpIdx);
4872 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4873 if (MO.isReg()) {
4874 SGPRUsed = MO.getReg();
4875 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
4876 ++ConstantBusCount;
4877 SGPRsUsed.push_back(SGPRUsed);
4878 }
4879 } else {
4880 if (!UsesLiteral) {
4881 ++ConstantBusCount;
4882 UsesLiteral = true;
4883 LiteralVal = &MO;
4884 } else if (!MO.isIdenticalTo(*LiteralVal)) {
4885 assert(isVOP2(MI) || isVOP3(MI));
4886 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
4887 return false;
4888 }
4889 }
4890 }
4891 }
4892
4893 SGPRUsed = findImplicitSGPRRead(MI);
4894 if (SGPRUsed) {
4895 // Implicit uses may safely overlap true operands
4896 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
4897 return !RI.regsOverlap(SGPRUsed, SGPR);
4898 })) {
4899 ++ConstantBusCount;
4900 SGPRsUsed.push_back(SGPRUsed);
4901 }
4902 }
4903
4904 // v_writelane_b32 is an exception from constant bus restriction:
4905 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
4906 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
4907 Opcode != AMDGPU::V_WRITELANE_B32) {
4908 ErrInfo = "VOP* instruction violates constant bus restriction";
4909 return false;
4910 }
4911
4912 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
4913 ErrInfo = "VOP3 instruction uses literal";
4914 return false;
4915 }
4916 }
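  // For instance, a VOP3 such as v_add_f32_e64 with two different SGPR
  // sources uses two constant-bus slots: that is rejected here on subtargets
  // whose limit is one, but accepted on subtargets that allow two reads.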
4917
4918 // Special case for writelane - this can break the multiple constant bus rule,
4919 // but still can't use more than one SGPR register
4920 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
4921 unsigned SGPRCount = 0;
4922 Register SGPRUsed;
4923
4924 for (int OpIdx : {Src0Idx, Src1Idx}) {
4925 if (OpIdx == -1)
4926 break;
4927
4928 const MachineOperand &MO = MI.getOperand(OpIdx);
4929
4930 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4931 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
4932 if (MO.getReg() != SGPRUsed)
4933 ++SGPRCount;
4934 SGPRUsed = MO.getReg();
4935 }
4936 }
4937 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
4938 ErrInfo = "WRITELANE instruction violates constant bus restriction";
4939 return false;
4940 }
4941 }
4942 }
4943
4944 // Verify misc. restrictions on specific instructions.
4945 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
4946 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
4947 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4948 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
4949 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
4950 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
4951 if (!compareMachineOp(Src0, Src1) &&
4952 !compareMachineOp(Src0, Src2)) {
4953 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
4954 return false;
4955 }
4956 }
4957 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
4958 SISrcMods::ABS) ||
4959 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
4960 SISrcMods::ABS) ||
4961 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
4962 SISrcMods::ABS)) {
4963 ErrInfo = "ABS not allowed in VOP3B instructions";
4964 return false;
4965 }
4966 }
4967
4968 if (isSOP2(MI) || isSOPC(MI)) {
4969 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4970 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
4971
4972 if (!Src0.isReg() && !Src1.isReg() &&
4973 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
4974 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
4975 !Src0.isIdenticalTo(Src1)) {
4976 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
4977 return false;
4978 }
4979 }
4980
4981 if (isSOPK(MI)) {
4982 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
4983 if (Desc.isBranch()) {
4984 if (!Op->isMBB()) {
4985 ErrInfo = "invalid branch target for SOPK instruction";
4986 return false;
4987 }
4988 } else {
4989 uint64_t Imm = Op->getImm();
4990 if (sopkIsZext(Opcode)) {
4991 if (!isUInt<16>(Imm)) {
4992 ErrInfo = "invalid immediate for SOPK instruction";
4993 return false;
4994 }
4995 } else {
4996 if (!isInt<16>(Imm)) {
4997 ErrInfo = "invalid immediate for SOPK instruction";
4998 return false;
4999 }
5000 }
5001 }
5002 }
5003
5004 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5005 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5006 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5007 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5008 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5009 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5010
5011 const unsigned StaticNumOps =
5012 Desc.getNumOperands() + Desc.implicit_uses().size();
5013 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5014
5015 // Allow additional implicit operands. This allows a fixup done by the post
5016 // RA scheduler where the main implicit operand is killed and implicit-defs
5017 // are added for sub-registers that remain live after this instruction.
5018 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5019 ErrInfo = "missing implicit register operands";
5020 return false;
5021 }
5022
5023 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5024 if (IsDst) {
5025 if (!Dst->isUse()) {
5026 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5027 return false;
5028 }
5029
5030 unsigned UseOpIdx;
5031 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5032 UseOpIdx != StaticNumOps + 1) {
5033 ErrInfo = "movrel implicit operands should be tied";
5034 return false;
5035 }
5036 }
5037
5038 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5039 const MachineOperand &ImpUse
5040 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5041 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5042 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5043 ErrInfo = "src0 should be subreg of implicit vector use";
5044 return false;
5045 }
5046 }
5047
5048 // Make sure we aren't losing exec uses in the td files. This mostly requires
5049 // being careful when using let Uses to try to add other use registers.
5050 if (shouldReadExec(MI)) {
5051 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5052 ErrInfo = "VALU instruction does not implicitly read exec mask";
5053 return false;
5054 }
5055 }
5056
5057 if (isSMRD(MI)) {
5058 if (MI.mayStore() &&
5059     ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5060   // The register offset form of scalar stores may only use m0 as the
5061 // soffset register.
5062 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5063 if (Soff && Soff->getReg() != AMDGPU::M0) {
5064 ErrInfo = "scalar stores must use m0 as offset register";
5065 return false;
5066 }
5067 }
5068 }
5069
5070 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5071 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5072 if (Offset->getImm() != 0) {
5073 ErrInfo = "subtarget does not support offsets in flat instructions";
5074 return false;
5075 }
5076 }
5077
5078 if (isDS(MI) && !ST.hasGDS()) {
5079 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5080 if (GDSOp && GDSOp->getImm() != 0) {
5081 ErrInfo = "GDS is not supported on this subtarget";
5082 return false;
5083 }
5084 }
5085
5086 if (isImage(MI)) {
5087 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5088 if (DimOp) {
5089 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5090 AMDGPU::OpName::vaddr0);
5091 int RSrcOpName =
5092 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5093 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5094 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5095 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5096     AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5097 const AMDGPU::MIMGDimInfo *Dim =
5098     AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5099
5100 if (!Dim) {
5101 ErrInfo = "dim is out of range";
5102 return false;
5103 }
5104
5105 bool IsA16 = false;
5106 if (ST.hasR128A16()) {
5107 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5108 IsA16 = R128A16->getImm() != 0;
5109 } else if (ST.hasA16()) {
5110 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5111 IsA16 = A16->getImm() != 0;
5112 }
5113
5114 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5115
5116 unsigned AddrWords =
5117 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5118
5119 unsigned VAddrWords;
5120 if (IsNSA) {
5121 VAddrWords = RsrcIdx - VAddr0Idx;
5122 if (ST.hasPartialNSAEncoding() &&
5123 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5124 unsigned LastVAddrIdx = RsrcIdx - 1;
5125 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5126 }
5127 } else {
5128 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5129 if (AddrWords > 12)
5130 AddrWords = 16;
5131 }
5132
5133 if (VAddrWords != AddrWords) {
5134 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5135 << " but got " << VAddrWords << "\n");
5136 ErrInfo = "bad vaddr size";
5137 return false;
5138 }
5139 }
5140 }
5141
5142 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5143 if (DppCt) {
5144 using namespace AMDGPU::DPP;
5145
5146 unsigned DC = DppCt->getImm();
5147 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5148 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5149 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5150 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5151 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5152 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5153 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5154 ErrInfo = "Invalid dpp_ctrl value";
5155 return false;
5156 }
5157 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5158     ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5159   ErrInfo = "Invalid dpp_ctrl value: "
5160 "wavefront shifts are not supported on GFX10+";
5161 return false;
5162 }
5163 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5164     ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5165   ErrInfo = "Invalid dpp_ctrl value: "
5166 "broadcasts are not supported on GFX10+";
5167 return false;
5168 }
5169 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5170     ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5171   if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5172 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5173 !ST.hasGFX90AInsts()) {
5174 ErrInfo = "Invalid dpp_ctrl value: "
5175 "row_newbroadcast/row_share is not supported before "
5176 "GFX90A/GFX10";
5177 return false;
5178 } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5179 ErrInfo = "Invalid dpp_ctrl value: "
5180 "row_share and row_xmask are not supported before GFX10";
5181 return false;
5182 }
5183 }
5184
5185 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5186     !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) {
5187   ErrInfo = "Invalid dpp_ctrl value: "
5188 "DP ALU dpp only support row_newbcast";
5189 return false;
5190 }
5191 }
5192
5193 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5194 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5195 uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
5196 : AMDGPU::OpName::vdata;
5197 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
5198 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5199 if (Data && !Data->isReg())
5200 Data = nullptr;
5201
5202 if (ST.hasGFX90AInsts()) {
5203 if (Dst && Data &&
5204 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5205 ErrInfo = "Invalid register class: "
5206 "vdata and vdst should be both VGPR or AGPR";
5207 return false;
5208 }
5209 if (Data && Data2 &&
5210 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5211 ErrInfo = "Invalid register class: "
5212 "both data operands should be VGPR or AGPR";
5213 return false;
5214 }
5215 } else {
5216 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5217 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5218 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5219 ErrInfo = "Invalid register class: "
5220 "agpr loads and stores not supported on this GPU";
5221 return false;
5222 }
5223 }
5224 }
5225
5226 if (ST.needsAlignedVGPRs()) {
5227 const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
5228   const MachineOperand *Op = getNamedOperand(MI, OpName);
5229   if (!Op)
5230 return true;
5231 Register Reg = Op->getReg();
5232 if (Reg.isPhysical())
5233 return !(RI.getHWRegIndex(Reg) & 1);
5234 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5235 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5236 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5237 };
5238
5239 if (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
5240 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
5241 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
5242
5243 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5244 ErrInfo = "Subtarget requires even aligned vector registers "
5245 "for DS_GWS instructions";
5246 return false;
5247 }
5248 }
5249
5250 if (isMIMG(MI)) {
5251 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5252 ErrInfo = "Subtarget requires even aligned vector registers "
5253 "for vaddr operand of image instructions";
5254 return false;
5255 }
5256 }
5257 }
5258
5259 if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
5260 !ST.hasGFX90AInsts()) {
5261 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5262 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5263 ErrInfo = "Invalid register class: "
5264 "v_accvgpr_write with an SGPR is not supported on this GPU";
5265 return false;
5266 }
5267 }
5268
5269 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5270 const MachineOperand &SrcOp = MI.getOperand(1);
5271 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5272 ErrInfo = "pseudo expects only physical SGPRs";
5273 return false;
5274 }
5275 }
5276
5277 return true;
5278}
5279
5280// It is more readable to list mapped opcodes on the same line.
5281// clang-format off
5282
5283 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5284   switch (MI.getOpcode()) {
5285 default: return AMDGPU::INSTRUCTION_LIST_END;
5286 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5287 case AMDGPU::COPY: return AMDGPU::COPY;
5288 case AMDGPU::PHI: return AMDGPU::PHI;
5289 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5290 case AMDGPU::WQM: return AMDGPU::WQM;
5291 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5292 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5293 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5294 case AMDGPU::S_MOV_B32: {
5295 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5296 return MI.getOperand(1).isReg() ||
5297 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5298 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5299 }
5300 case AMDGPU::S_ADD_I32:
5301 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5302 case AMDGPU::S_ADDC_U32:
5303 return AMDGPU::V_ADDC_U32_e32;
5304 case AMDGPU::S_SUB_I32:
5305 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5306 // FIXME: These are not consistently handled, and selected when the carry is
5307 // used.
5308 case AMDGPU::S_ADD_U32:
5309 return AMDGPU::V_ADD_CO_U32_e32;
5310 case AMDGPU::S_SUB_U32:
5311 return AMDGPU::V_SUB_CO_U32_e32;
5312 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5313 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5314 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5315 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5316 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5317 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5318 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5319 case AMDGPU::S_XNOR_B32:
5320 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5321 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5322 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5323 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5324 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5325 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5326 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5327 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5328 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5329 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5330 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5331 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5332 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5333 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5334 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5335 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5336 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5337 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5338 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5339 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5340 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5341 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5342 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5343 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5344 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5345 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5346 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5347 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5348 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5349 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5350 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5351 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5352 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5353 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5354 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5355 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5356 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5357 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5358 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5359 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5360 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5361 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5362 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5363 case AMDGPU::S_CVT_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5364 case AMDGPU::S_CVT_HI_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5365 case AMDGPU::S_CVT_F16_F32: return AMDGPU::V_CVT_F16_F32_t16_e64;
5366 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5367 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5368 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5369 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5370 case AMDGPU::S_CEIL_F16:
5371 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5372 : AMDGPU::V_CEIL_F16_fake16_e64;
5373 case AMDGPU::S_FLOOR_F16:
5374 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5375 : AMDGPU::V_FLOOR_F16_fake16_e64;
5376 case AMDGPU::S_TRUNC_F16:
5377 return AMDGPU::V_TRUNC_F16_fake16_e64;
5378 case AMDGPU::S_RNDNE_F16:
5379 return AMDGPU::V_RNDNE_F16_fake16_e64;
5380 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5381 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5382 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5383 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5384 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5385 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5386 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5387 case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
5388 case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
5389 case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
5390 case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5391 case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5392 case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5393 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
5394 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5395 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5396 case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64;
5397 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5398 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5399 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5400 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5401 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5402 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5403 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5404 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5405 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5406 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5407 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5408 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5409 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5410 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5411 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5412 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5413 case AMDGPU::S_CMP_LT_F16: return AMDGPU::V_CMP_LT_F16_t16_e64;
5414 case AMDGPU::S_CMP_EQ_F16: return AMDGPU::V_CMP_EQ_F16_t16_e64;
5415 case AMDGPU::S_CMP_LE_F16: return AMDGPU::V_CMP_LE_F16_t16_e64;
5416 case AMDGPU::S_CMP_GT_F16: return AMDGPU::V_CMP_GT_F16_t16_e64;
5417 case AMDGPU::S_CMP_LG_F16: return AMDGPU::V_CMP_LG_F16_t16_e64;
5418 case AMDGPU::S_CMP_GE_F16: return AMDGPU::V_CMP_GE_F16_t16_e64;
5419 case AMDGPU::S_CMP_O_F16: return AMDGPU::V_CMP_O_F16_t16_e64;
5420 case AMDGPU::S_CMP_U_F16: return AMDGPU::V_CMP_U_F16_t16_e64;
5421 case AMDGPU::S_CMP_NGE_F16: return AMDGPU::V_CMP_NGE_F16_t16_e64;
5422 case AMDGPU::S_CMP_NLG_F16: return AMDGPU::V_CMP_NLG_F16_t16_e64;
5423 case AMDGPU::S_CMP_NGT_F16: return AMDGPU::V_CMP_NGT_F16_t16_e64;
5424 case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
5425 case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
5426 case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
5427 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5428 case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
5429 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5430 case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
5431 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5432 case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
5433 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5434 case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
5435 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5436 case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
5437 }
5439 "Unexpected scalar opcode without corresponding vector one!");
5440}
5441
5442// clang-format on
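// Illustrative note (a sketch, not upstream text; virtual register names are
// placeholders): getVALUOp() only picks the matching VALU opcode, while
// moveToVALU() below performs the actual rewrite. For example, a scalar
//   %2:sreg_32 = S_AND_B32 %0, %1, implicit-def $scc
// that has to move to VGPRs is re-created roughly as
//   %2:vgpr_32 = V_AND_B32_e64 %0, %1
// and any SCC users are fixed up separately, since the VALU form does not
// define SCC.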
5443
5444 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5445                                         MachineBasicBlock &MBB,
5446                                         MachineBasicBlock::iterator MBBI,
5447                                         const DebugLoc &DL, Register Reg,
5448 bool IsSCCLive,
5449 SlotIndexes *Indexes) const {
5450 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5451 const SIInstrInfo *TII = ST.getInstrInfo();
5452 bool IsWave32 = ST.isWave32();
5453 if (IsSCCLive) {
5454 // Insert two move instructions, one to save the original value of EXEC and
5455 // the other to turn on all bits in EXEC. This is required because the single
5456 // instruction S_OR_SAVEEXEC would clobber SCC, which is still live here.
5457 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5458 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5459 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5460 .addReg(Exec, RegState::Kill);
5461 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5462 if (Indexes) {
5463 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5464 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5465 }
5466 } else {
5467 const unsigned OrSaveExec =
5468 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5469 auto SaveExec =
5470 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5471 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5472 if (Indexes)
5473 Indexes->insertMachineInstrInMaps(*SaveExec);
5474 }
5475}
5476
5477 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5478                               MachineBasicBlock::iterator MBBI,
5479                               const DebugLoc &DL, Register Reg,
5480 SlotIndexes *Indexes) const {
5481 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5482 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5483 auto ExecRestoreMI =
5484 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5485 if (Indexes)
5486 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5487}
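// Illustrative pairing (a sketch; SavedExecReg is a placeholder name): a caller
// that needs every lane enabled around a region, e.g. a scratch spill, emits
// roughly
//   insertScratchExecCopy(MF, MBB, MBBI, DL, SavedExecReg,
//                         /*IsSCCLive=*/false, /*Indexes=*/nullptr);
//   ... code that must run with a full EXEC mask ...
//   restoreExec(MF, MBB, MBBI, DL, SavedExecReg, /*Indexes=*/nullptr);
// On wave32 subtargets the exec_lo / s_mov_b32 forms are used instead of the
// 64-bit ones, as selected above.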
5488
5489static const TargetRegisterClass *
5490 adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
5491                           const MachineRegisterInfo &MRI,
5492 const MCInstrDesc &TID, unsigned RCID,
5493 bool IsAllocatable) {
5494 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5495 (((TID.mayLoad() || TID.mayStore()) &&
5496 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5498 switch (RCID) {
5499 case AMDGPU::AV_32RegClassID:
5500 RCID = AMDGPU::VGPR_32RegClassID;
5501 break;
5502 case AMDGPU::AV_64RegClassID:
5503 RCID = AMDGPU::VReg_64RegClassID;
5504 break;
5505 case AMDGPU::AV_96RegClassID:
5506 RCID = AMDGPU::VReg_96RegClassID;
5507 break;
5508 case AMDGPU::AV_128RegClassID:
5509 RCID = AMDGPU::VReg_128RegClassID;
5510 break;
5511 case AMDGPU::AV_160RegClassID:
5512 RCID = AMDGPU::VReg_160RegClassID;
5513 break;
5514 case AMDGPU::AV_512RegClassID:
5515 RCID = AMDGPU::VReg_512RegClassID;
5516 break;
5517 default:
5518 break;
5519 }
5520 }
5521
5522 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5523}
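// Descriptive note (an interpretation, not upstream text): the helper above
// downgrades the combined AV_* (VGPR-or-AGPR) classes to plain VGPR classes
// for the load/store, DS and MIMG operands matched by the condition, so that
// later passes such as MachineCopyPropagation do not end up pairing a VGPR
// vdst with an AGPR vdata (or vice versa).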
5524
5525 const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
5526     unsigned OpNum, const TargetRegisterInfo *TRI,
5527 const MachineFunction &MF)
5528 const {
5529 if (OpNum >= TID.getNumOperands())
5530 return nullptr;
5531 auto RegClass = TID.operands()[OpNum].RegClass;
5532 bool IsAllocatable = false;
5533 if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
5534 // vdst and vdata should both be VGPR or AGPR, same for the DS instructions
5535 // with two data operands. Request a register class constrained to VGPR only
5536 // if both operands are present, as Machine Copy Propagation cannot check
5537 // this constraint, and possibly other passes cannot either.
5538 //
5539 // The check is limited to FLAT and DS because atomics in non-flat encoding
5540 // have their vdst and vdata tied to be the same register.
5541 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5542 AMDGPU::OpName::vdst);
5543 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5544 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5545 : AMDGPU::OpName::vdata);
5546 if (DataIdx != -1) {
5547 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5548 TID.Opcode, AMDGPU::OpName::data1);
5549 }
5550 }
5551 return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
5552 IsAllocatable);
5553}
5554
5555 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
5556                                                       unsigned OpNo) const {
5557 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5558 const MCInstrDesc &Desc = get(MI.getOpcode());
5559 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
5560 Desc.operands()[OpNo].RegClass == -1) {
5561 Register Reg = MI.getOperand(OpNo).getReg();
5562
5563 if (Reg.isVirtual())
5564 return MRI.getRegClass(Reg);
5565 return RI.getPhysRegBaseClass(Reg);
5566 }
5567
5568 unsigned RCID = Desc.operands()[OpNo].RegClass;
5569 return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
5570}
5571
5574 MachineBasicBlock *MBB = MI.getParent();
5575 MachineOperand &MO = MI.getOperand(OpIdx);
5577 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
5578 const TargetRegisterClass *RC = RI.getRegClass(RCID);
5579 unsigned Size = RI.getRegSizeInBits(*RC);
5580 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
5581 if (MO.isReg())
5582 Opcode = AMDGPU::COPY;
5583 else if (RI.isSGPRClass(RC))
5584 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
5585
5586 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
5587 Register Reg = MRI.createVirtualRegister(VRC);
5589 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
5590 MO.ChangeToRegister(Reg, false);
5591}
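// Rough illustration of legalizeOpWithMove() (a sketch; register names are
// placeholders): an operand that cannot legally sit in its slot is replaced
// with a freshly created VGPR, e.g.
//   %d = V_FOO ..., %sgpr, ...
// becomes
//   %tmp:vgpr_32 = COPY %sgpr        ; V_MOV_B32_e32 for an immediate operand
//   %d = V_FOO ..., %tmp, ...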
5592
5595 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
5596 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5597 MachineBasicBlock *MBB = MI->getParent();
5598 DebugLoc DL = MI->getDebugLoc();
5599 Register SubReg = MRI.createVirtualRegister(SubRC);
5600
5601 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
5602 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5603 .addReg(SuperReg.getReg(), 0, SubIdx);
5604 return SubReg;
5605 }
5606
5607 // Just in case the super register is itself a sub-register, copy it to a new
5608 // value so we don't need to worry about merging its subreg index with the
5609 // SubIdx passed to this function. The register coalescer should be able to
5610 // eliminate this extra copy.
5611 Register NewSuperReg = MRI.createVirtualRegister(SuperRC);
5612
5613 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
5614 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
5615
5616 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5617 .addReg(NewSuperReg, 0, SubIdx);
5618
5619 return SubReg;
5620}
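// Illustrative expansion (a sketch): when the super register has no
// subregister index of its own, extracting sub0 is just
//   %lo = COPY %wide.sub0
// otherwise the super register is first copied whole into a register of
// SuperRC and the subregister is taken from that copy, with the register
// coalescer expected to remove the extra COPY.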
5621
5624 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
5625 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5626 if (Op.isImm()) {
5627 if (SubIdx == AMDGPU::sub0)
5628 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
5629 if (SubIdx == AMDGPU::sub1)
5630 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
5631
5632 llvm_unreachable("Unhandled register index for immediate");
5633 }
5634
5635 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
5636 SubIdx, SubRC);
5637 return MachineOperand::CreateReg(SubReg, false);
5638}
5639
5640// Change the order of operands from (0, 1, 2) to (0, 2, 1)
5641void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
5642 assert(Inst.getNumExplicitOperands() == 3);
5643 MachineOperand Op1 = Inst.getOperand(1);
5644 Inst.removeOperand(1);
5645 Inst.addOperand(Op1);
5646}
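// E.g. (illustrative) explicit operands (dst, a, b) end up as (dst, b, a)
// after the remove/re-append above.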
5647
5649 const MCOperandInfo &OpInfo,
5650 const MachineOperand &MO) const {
5651 if (!MO.isReg())
5652 return false;
5653
5654 Register Reg = MO.getReg();
5655
5656 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
5657 if (Reg.isPhysical())
5658 return DRC->contains(Reg);
5659
5660 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
5661
5662 if (MO.getSubReg()) {
5663 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5664 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
5665 if (!SuperRC)
5666 return false;
5667
5668 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
5669 if (!DRC)
5670 return false;
5671 }
5672 return RC->hasSuperClassEq(DRC);
5673}
5674
5676 const MCOperandInfo &OpInfo,
5677 const MachineOperand &MO) const {
5678 if (MO.isReg())
5679 return isLegalRegOperand(MRI, OpInfo, MO);
5680
5681 // Handle non-register types that are treated like immediates.
5682 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
5683 return true;
5684}
5685
5686bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
5687 const MachineOperand *MO) const {
5688 const MachineFunction &MF = *MI.getParent()->getParent();
5689 const MachineRegisterInfo &MRI = MF.getRegInfo();
5690 const MCInstrDesc &InstDesc = MI.getDesc();
5691 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
5692 const TargetRegisterClass *DefinedRC =
5693 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
5694 if (!MO)
5695 MO = &MI.getOperand(OpIdx);
5696
5697 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
5698 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
5699 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
5700 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--)
5701 return false;
5702
5704 if (MO->isReg())
5705 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
5706
5707 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
5708 if (i == OpIdx)
5709 continue;
5710 const MachineOperand &Op = MI.getOperand(i);
5711 if (Op.isReg()) {
5712 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
5713 if (!SGPRsUsed.count(SGPR) &&
5714 // FIXME: This can access off the end of the operands() array.
5715 usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
5716 if (--ConstantBusLimit <= 0)
5717 return false;
5718 SGPRsUsed.insert(SGPR);
5719 }
5720 } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
5721 !isInlineConstant(Op, InstDesc.operands()[i])) {
5722 if (!LiteralLimit--)
5723 return false;
5724 if (--ConstantBusLimit <= 0)
5725 return false;
5726 }
5727 }
5728 }
5729
5730 if (MO->isReg()) {
5731 if (!DefinedRC)
5732 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
5733 if (!isLegalRegOperand(MRI, OpInfo, *MO))
5734 return false;
5735 bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
5736 if (IsAGPR && !ST.hasMAIInsts())
5737 return false;
5738 unsigned Opc = MI.getOpcode();
5739 if (IsAGPR &&
5740 (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5741 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5742 return false;
5743 // Atomics should have both vdst and vdata either vgpr or agpr.
5744 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5745 const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
5746 isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5747 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5748 MI.getOperand(DataIdx).isReg() &&
5749 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5750 return false;
5751 if ((int)OpIdx == DataIdx) {
5752 if (VDstIdx != -1 &&
5753 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5754 return false;
5755 // DS instructions with 2 src operands also must have tied RC.
5756 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
5757 AMDGPU::OpName::data1);
5758 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5759 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5760 return false;
5761 }
5762 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5763 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5764 RI.isSGPRReg(MRI, MO->getReg()))
5765 return false;
5766 return true;
5767 }
5768
5769 if (MO->isImm()) {
5770 uint64_t Imm = MO->getImm();
5771 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
5772 bool Is64BitOp = Is64BitFPOp ||
5776 if (Is64BitOp &&
5778 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
5779 return false;
5780
5781 // FIXME: We can use sign extended 64-bit literals, but only for signed
5782 // operands. At the moment we do not know if an operand is signed.
5783 // Such operand will be encoded as its low 32 bits and then either
5784 // correctly sign extended or incorrectly zero extended by HW.
5785 if (!Is64BitFPOp && (int32_t)Imm < 0)
5786 return false;
5787 }
5788 }
5789
5790 // Handle non-register types that are treated like immediates.
5791 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
5792
5793 if (!DefinedRC) {
5794 // This operand expects an immediate.
5795 return true;
5796 }
5797
5798 return isImmOperandLegal(MI, OpIdx, *MO);
5799}
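// Worked example (a sketch, not upstream text): with a constant-bus limit of 1
// (pre-GFX10), asking whether a second SGPR would be a legal source of a VALU
// instruction that already reads one SGPR returns false here, which is what
// later drives legalizeOperandsVOP2/VOP3 to commute operands or insert copies:
//   V_ADD_F32_e64 %sgpr0, %sgpr1    ; rejected, two SGPRs but one bus slot
//   V_ADD_F32_e64 %sgpr0, %vgpr1    ; accepted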
5800
5801 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
5802                                        MachineInstr &MI) const {
5803 unsigned Opc = MI.getOpcode();
5804 const MCInstrDesc &InstrDesc = get(Opc);
5805
5806 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5807 MachineOperand &Src0 = MI.getOperand(Src0Idx);
5808
5809 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
5810 MachineOperand &Src1 = MI.getOperand(Src1Idx);
5811
5812 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
5813 // we need to only have one constant bus use before GFX10.
5814 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
5815 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
5816 RI.isSGPRReg(MRI, Src0.getReg()))
5817 legalizeOpWithMove(MI, Src0Idx);
5818
5819 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
5820 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
5821 // src0/src1 with V_READFIRSTLANE.
5822 if (Opc == AMDGPU::V_WRITELANE_B32) {
5823 const DebugLoc &DL = MI.getDebugLoc();
5824 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
5825 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5826 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5827 .add(Src0);
5828 Src0.ChangeToRegister(Reg, false);
5829 }
5830 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
5831 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5832 const DebugLoc &DL = MI.getDebugLoc();
5833 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5834 .add(Src1);
5835 Src1.ChangeToRegister(Reg, false);
5836 }
5837 return;
5838 }
5839
5840 // No VOP2 instructions support AGPRs.
5841 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
5842 legalizeOpWithMove(MI, Src0Idx);
5843
5844 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
5845 legalizeOpWithMove(MI, Src1Idx);
5846
5847 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
5848 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
5849 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
5850 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
5851 legalizeOpWithMove(MI, Src2Idx);
5852 }
5853
5854 // VOP2 src0 operands support all operand types, so we don't need to check
5855 // their legality. If src1 is already legal, we don't need to do anything.
5856 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
5857 return;
5858
5859 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
5860 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
5861 // select is uniform.
5862 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
5863 RI.isVGPR(MRI, Src1.getReg())) {
5864 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5865 const DebugLoc &DL = MI.getDebugLoc();
5866 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5867 .add(Src1);
5868 Src1.ChangeToRegister(Reg, false);
5869 return;
5870 }
5871
5872 // We do not use commuteInstruction here because it is too aggressive and will
5873 // commute whenever it is possible. We only want to commute here if it improves
5874 // legality. This can be called a fairly large number of times, so don't waste
5875 // compile time pointlessly swapping and checking legality again.
5876 if (HasImplicitSGPR || !MI.isCommutable()) {
5877 legalizeOpWithMove(MI, Src1Idx);
5878 return;
5879 }
5880
5881 // If src0 can be used as src1, commuting will make the operands legal.
5882 // Otherwise we have to give up and insert a move.
5883 //
5884 // TODO: Other immediate-like operand kinds could be commuted if there was a
5885 // MachineOperand::ChangeTo* for them.
5886 if ((!Src1.isImm() && !Src1.isReg()) ||
5887 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
5888 legalizeOpWithMove(MI, Src1Idx);
5889 return;
5890 }
5891
5892 int CommutedOpc = commuteOpcode(MI);
5893 if (CommutedOpc == -1) {
5894 legalizeOpWithMove(MI, Src1Idx);
5895 return;
5896 }
5897
5898 MI.setDesc(get(CommutedOpc));
5899
5900 Register Src0Reg = Src0.getReg();
5901 unsigned Src0SubReg = Src0.getSubReg();
5902 bool Src0Kill = Src0.isKill();
5903
5904 if (Src1.isImm())
5905 Src0.ChangeToImmediate(Src1.getImm());
5906 else if (Src1.isReg()) {
5907 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
5908 Src0.setSubReg(Src1.getSubReg());
5909 } else
5910 llvm_unreachable("Should only have register or immediate operands");
5911
5912 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
5913 Src1.setSubReg(Src0SubReg);
5915}
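// Illustrative before/after for the commute path above (a sketch; register
// names are placeholders):
//   %d = V_ADD_CO_U32_e32 %vgpr, %sgpr    ; src1 may not be an SGPR in VOP2
// becomes, when src0 is acceptable in the src1 slot,
//   %d = V_ADD_CO_U32_e32 %sgpr, %vgpr    ; commuted, now legal
// and otherwise src1 is rewritten through legalizeOpWithMove (a COPY into a
// fresh VGPR).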
5916
5917 // Legalize VOP3 operands. All operand types are supported for any operand,
5918 // but only one literal constant is allowed, and only starting from GFX10.
5919 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
5920                                        MachineInstr &MI) const {
5921 unsigned Opc = MI.getOpcode();
5922
5923 int VOP3Idx[3] = {
5924 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
5925 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
5926 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
5927 };
5928
5929 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
5930 Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
5931 // src1 and src2 must be scalar
5932 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
5933 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
5934 const DebugLoc &DL = MI.getDebugLoc();
5935 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
5936 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5937 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5938 .add(Src1);
5939 Src1.ChangeToRegister(Reg, false);
5940 }
5941 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
5942 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5943 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5944 .add(Src2);
5945 Src2.ChangeToRegister(Reg, false);
5946 }
5947 }
5948
5949 // Find the one SGPR operand we are allowed to use.
5950 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
5951 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
5952 SmallDenseSet<unsigned> SGPRsUsed;
5953 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
5954 if (SGPRReg) {
5955 SGPRsUsed.insert(SGPRReg);
5956 --ConstantBusLimit;
5957 }
5958
5959 for (int Idx : VOP3Idx) {
5960 if (Idx == -1)
5961 break;
5962 MachineOperand &MO = MI.getOperand(Idx);
5963
5964 if (!MO.isReg()) {
5965 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
5966 continue;
5967
5968 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
5969 --LiteralLimit;
5970 --ConstantBusLimit;
5971 continue;
5972 }
5973
5974 --LiteralLimit;
5975 --ConstantBusLimit;
5976 legalizeOpWithMove(MI, Idx);
5977 continue;
5978 }
5979
5980 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
5981 !isOperandLegal(MI, Idx, &MO)) {
5982 legalizeOpWithMove(MI, Idx);
5983 continue;
5984 }
5985
5986 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
5987 continue; // VGPRs are legal
5988
5989 // We can use one SGPR in each VOP3 instruction prior to GFX10
5990 // and two starting from GFX10.
5991 if (SGPRsUsed.count(MO.getReg()))
5992 continue;
5993 if (ConstantBusLimit > 0) {
5994 SGPRsUsed.insert(MO.getReg());
5995 --ConstantBusLimit;
5996 continue;
5997 }
5998
5999 // If we make it this far, then the operand is not legal and we must
6000 // legalize it.
6001 legalizeOpWithMove(MI, Idx);
6002 }
6003
6004 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6005 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6006 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6007 legalizeOpWithMove(MI, VOP3Idx[2]);
6008}
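// Illustrative example (a sketch): pre-GFX10 a VOP3 may read only one SGPR
// through the constant bus, so something like
//   %d = V_FMA_F32_e64 %s0, %s1, %s2      ; three SGPR sources
// keeps the SGPR returned by findUsedSGPR and the loop above copies the other
// two scalar sources into fresh VGPRs via legalizeOpWithMove.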
6009
6010 Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
6011                                          MachineRegisterInfo &MRI) const {
6012 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6013 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6014 Register DstReg = MRI.createVirtualRegister(SRC);
6015 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6016
6017 if (RI.hasAGPRs(VRC)) {
6018 VRC = RI.getEquivalentVGPRClass(VRC);
6019 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6020 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6021 get(TargetOpcode::COPY), NewSrcReg)
6022 .addReg(SrcReg);
6023 SrcReg = NewSrcReg;
6024 }
6025
6026 if (SubRegs == 1) {
6027 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6028 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6029 .addReg(SrcReg);
6030 return DstReg;
6031 }
6032
6034 for (unsigned i = 0; i < SubRegs; ++i) {
6035 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6036 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6037 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6038 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6039 SRegs.push_back(SGPR);
6040 }
6041
6043 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6044 get(AMDGPU::REG_SEQUENCE), DstReg);
6045 for (unsigned i = 0; i < SubRegs; ++i) {
6046 MIB.addReg(SRegs[i]);
6047 MIB.addImm(RI.getSubRegFromChannel(i));
6048 }
6049 return DstReg;
6050}
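// Illustrative expansion (a sketch) for a 64-bit pointer held in VGPRs:
//   %s0:sgpr_32 = V_READFIRSTLANE_B32 %v.sub0
//   %s1:sgpr_32 = V_READFIRSTLANE_B32 %v.sub1
//   %p:sgpr_64  = REG_SEQUENCE %s0, %subreg.sub0, %s1, %subreg.sub1
// which is the shape legalizeOperandsSMRD() below relies on for its uniform
// base and offset operands.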
6051
6052 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6053                                        MachineInstr &MI) const {
6054
6055 // If the pointer is stored in VGPRs, then we need to move it to
6056 // SGPRs using v_readfirstlane. This is safe because we only select
6057 // loads with uniform pointers to SMRD instructions, so we know the
6058 // pointer value is uniform.
6059 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6060 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6061 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6062 SBase->setReg(SGPR);
6063 }
6064 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6065 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
6066 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6067 SOff->setReg(SGPR);
6068 }
6069}
6070
6071 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6072 unsigned Opc = Inst.getOpcode();
6073 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6074 if (OldSAddrIdx < 0)
6075 return false;
6076
6078
6079 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6080 if (NewOpc < 0)
6082 if (NewOpc < 0)
6083 return false;
6084
6086 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6087 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6088 return false;
6089
6090 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6091 if (NewVAddrIdx < 0)
6092 return false;
6093
6094 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6095
6096 // Check vaddr, it shall be zero or absent.
6097 MachineInstr *VAddrDef = nullptr;
6098 if (OldVAddrIdx >= 0) {
6099 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6100 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6101 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6102 !VAddrDef->getOperand(1).isImm() ||
6103 VAddrDef->getOperand(1).getImm() != 0)
6104 return false;
6105 }
6106
6107 const MCInstrDesc &NewDesc = get(NewOpc);
6108 Inst.setDesc(NewDesc);
6109
6110 // Callers expect iterator to be valid after this call, so modify the
6111 // instruction in place.
6112 if (OldVAddrIdx == NewVAddrIdx) {
6113 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6114 // Clear use list from the old vaddr holding a zero register.
6115 MRI.removeRegOperandFromUseList(&NewVAddr);
6116 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6117 Inst.removeOperand(OldSAddrIdx);
6118 // Update the use list with the pointer we have just moved from the saddr to
6119 // the vaddr position. Otherwise the new vaddr will be missing from the use list.
6120 MRI.removeRegOperandFromUseList(&NewVAddr);
6121 MRI.addRegOperandToUseList(&NewVAddr);
6122 } else {
6123 assert(OldSAddrIdx == NewVAddrIdx);
6124
6125 if (OldVAddrIdx >= 0) {
6126 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6127 AMDGPU::OpName::vdst_in);
6128
6129 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6130 // it asserts. Untie the operands for now and retie them afterwards.
6131 if (NewVDstIn != -1) {
6132 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6133 Inst.untieRegOperand(OldVDstIn);
6134 }
6135
6136 Inst.removeOperand(OldVAddrIdx);
6137
6138 if (NewVDstIn != -1) {
6139 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6140 Inst.tieOperands(NewVDst, NewVDstIn);
6141 }
6142 }
6143 }
6144
6145 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6146 VAddrDef->eraseFromParent();
6147
6148 return true;
6149}
6150
6151// FIXME: Remove this when SelectionDAG is obsoleted.
6152 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6153                                        MachineInstr &MI) const {
6155 return;
6156
6157 // Fix up SGPR operands in VGPRs. We only select these when the DAG divergence
6158 // analysis thinks they are uniform, so a readfirstlane should be valid.
6159 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6160 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6161 return;
6162
6164 return;
6165
6166 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
6167 SAddr->setReg(ToSGPR);
6168}
6169
6170 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6171                                          MachineBasicBlock::iterator I,
6172                                          const TargetRegisterClass *DstRC,
6173                                          MachineOperand &Op,
6174                                          MachineRegisterInfo &MRI,
6175                                          const DebugLoc &DL) const {
6176 Register OpReg = Op.getReg();
6177 unsigned OpSubReg = Op.getSubReg();
6178
6179 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6180 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6181
6182 // Check if operand is already the correct register class.
6183 if (DstRC == OpRC)
6184 return;
6185
6186 Register DstReg = MRI.createVirtualRegister(DstRC);
6187 auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
6188
6189 Op.setReg(DstReg);
6190 Op.setSubReg(0);
6191
6192 MachineInstr *Def = MRI.getVRegDef(OpReg);
6193 if (!Def)
6194 return;
6195
6196 // Try to eliminate the copy if it is copying an immediate value.
6197 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6198 foldImmediate(*Copy, *Def, OpReg, &MRI);
6199
6200 bool ImpDef = Def->isImplicitDef();
6201 while (!ImpDef && Def && Def->isCopy()) {
6202 if (Def->getOperand(1).getReg().isPhysical())
6203 break;
6204 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6205 ImpDef = Def && Def->isImplicitDef();
6206 }
6207 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6208 !ImpDef)
6209 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6210}
6211
6212// Emit the actual waterfall loop, executing the wrapped instruction for each
6213// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6214// iteration, in the worst case we execute 64 (once per lane).
6215 static void emitLoadScalarOpsFromVGPRLoop(
6216     const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB,
6217     MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL,
6218 ArrayRef<MachineOperand *> ScalarOps) {
6219 MachineFunction &MF = *OrigBB.getParent();
6220 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6221 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6222 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6223 unsigned SaveExecOpc =
6224 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6225 unsigned XorTermOpc =
6226 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6227 unsigned AndOpc =
6228 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6229 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6230
6232
6233 SmallVector<Register, 8> ReadlanePieces;
6234 Register CondReg;
6235
6236 for (MachineOperand *ScalarOp : ScalarOps) {
6237 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6238 unsigned NumSubRegs = RegSize / 32;
6239 Register VScalarOp = ScalarOp->getReg();
6240
6241 if (NumSubRegs == 1) {
6242 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6243
6244 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6245 .addReg(VScalarOp);
6246
6247 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6248
6249 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6250 .addReg(CurReg)
6251 .addReg(VScalarOp);
6252
6253 // Combine the comparison results with AND.
6254 if (!CondReg) // First.
6255 CondReg = NewCondReg;
6256 else { // If not the first, we create an AND.
6257 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6258 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6259 .addReg(CondReg)
6260 .addReg(NewCondReg);
6261 CondReg = AndReg;
6262 }
6263
6264 // Update ScalarOp operand to use the SGPR ScalarOp.
6265 ScalarOp->setReg(CurReg);
6266 ScalarOp->setIsKill();
6267 } else {
6268 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6269 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6270 "Unhandled register size");
6271
6272 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6273 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6274 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6275
6276 // Read the next variant <- also loop target.
6277 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6278 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6279
6280 // Read the next variant <- also loop target.
6281 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6282 .addReg(VScalarOp, VScalarOpUndef,
6283 TRI->getSubRegFromChannel(Idx + 1));
6284
6285 ReadlanePieces.push_back(CurRegLo);
6286 ReadlanePieces.push_back(CurRegHi);
6287
6288 // Comparison is to be done as 64-bit.
6289 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6290 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6291 .addReg(CurRegLo)
6292 .addImm(AMDGPU::sub0)
6293 .addReg(CurRegHi)
6294 .addImm(AMDGPU::sub1);
6295
6296 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6297 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6298 NewCondReg)
6299 .addReg(CurReg);
6300 if (NumSubRegs <= 2)
6301 Cmp.addReg(VScalarOp);
6302 else
6303 Cmp.addReg(VScalarOp, VScalarOpUndef,
6304 TRI->getSubRegFromChannel(Idx, 2));
6305
6306 // Combine the comparison results with AND.
6307 if (!CondReg) // First.
6308 CondReg = NewCondReg;
6309 else { // If not the first, we create an AND.
6310 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6311 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6312 .addReg(CondReg)
6313 .addReg(NewCondReg);
6314 CondReg = AndReg;
6315 }
6316 } // End for loop.
6317
6318 auto SScalarOpRC =
6319 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6320 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6321
6322 // Build scalar ScalarOp.
6323 auto Merge =
6324 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6325 unsigned Channel = 0;
6326 for (Register Piece : ReadlanePieces) {
6327 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6328 }
6329
6330 // Update ScalarOp operand to use the SGPR ScalarOp.
6331 ScalarOp->setReg(SScalarOp);
6332 ScalarOp->setIsKill();
6333 }
6334 }
6335
6336 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6337 MRI.setSimpleHint(SaveExec, CondReg);
6338
6339 // Update EXEC to matching lanes, saving original to SaveExec.
6340 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6341 .addReg(CondReg, RegState::Kill);
6342
6343 // The original instruction is here; we insert the terminators after it.
6344 I = BodyBB.end();
6345
6346 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6347 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6348 .addReg(Exec)
6349 .addReg(SaveExec);
6350
6351 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6352}
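// Shape of the emitted waterfall body (descriptive note): each iteration
// V_READFIRSTLANE_B32 reads the current lane's scalar value, V_CMP_EQ_U32/U64
// compares it against the VGPR operand, the per-operand conditions are ANDed
// together, and S_AND_SAVEEXEC narrows EXEC to the matching lanes before the
// wrapped instruction runs; the S_XOR_*_term / SI_WATERFALL_LOOP terminators
// then loop back until every active lane has been handled.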
6353
6354// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6355// with SGPRs by iterating over all unique values across all lanes.
6356// Returns the loop basic block that now contains \p MI.
6357static MachineBasicBlock *
6358 loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6359                                ArrayRef<MachineOperand *> ScalarOps,
6360                                MachineDominatorTree *MDT,
6361                                MachineBasicBlock::iterator Begin = nullptr,
6362 MachineBasicBlock::iterator End = nullptr) {
6363 MachineBasicBlock &MBB = *MI.getParent();
6364 MachineFunction &MF = *MBB.getParent();
6365 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6366 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6368 if (!Begin.isValid())
6369 Begin = &MI;
6370 if (!End.isValid()) {
6371 End = &MI;
6372 ++End;
6373 }
6374 const DebugLoc &DL = MI.getDebugLoc();
6375 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6376 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6377 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6378
6379 // Save SCC. Waterfall Loop may overwrite SCC.
6380 Register SaveSCCReg;
6381 bool SCCNotDead = (MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI, 30) !=
6383 if (SCCNotDead) {
6384 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6385 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6386 .addImm(1)
6387 .addImm(0);
6388 }
6389
6390 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6391
6392 // Save the EXEC mask
6393 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6394
6395 // Killed uses in the instruction we are waterfalling around will be
6396 // incorrect due to the added control-flow.
6398 ++AfterMI;
6399 for (auto I = Begin; I != AfterMI; I++) {
6400 for (auto &MO : I->all_uses())
6401 MRI.clearKillFlags(MO.getReg());
6402 }
6403
6404 // To insert the loop we need to split the block. Move everything after this
6405 // point to a new block, and insert a new empty block between the two.
6408 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6410 ++MBBI;
6411
6412 MF.insert(MBBI, LoopBB);
6413 MF.insert(MBBI, BodyBB);
6414 MF.insert(MBBI, RemainderBB);
6415
6416 LoopBB->addSuccessor(BodyBB);
6417 BodyBB->addSuccessor(LoopBB);
6418 BodyBB->addSuccessor(RemainderBB);
6419
6420 // Move Begin to MI to the BodyBB, and the remainder of the block to
6421 // RemainderBB.
6422 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6423 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6424 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6425
6426 MBB.addSuccessor(LoopBB);
6427
6428 // Update dominators. We know that MBB immediately dominates LoopBB, that
6429 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6430 // RemainderBB. RemainderBB immediately dominates all of the successors
6431 // transferred to it from MBB that MBB used to properly dominate.
6432 if (MDT) {
6433 MDT->addNewBlock(LoopBB, &MBB);
6434 MDT->addNewBlock(BodyBB, LoopBB);
6435 MDT->addNewBlock(RemainderBB, BodyBB);
6436 for (auto &Succ : RemainderBB->successors()) {
6437 if (MDT->properlyDominates(&MBB, Succ)) {
6438 MDT->changeImmediateDominator(Succ, RemainderBB);
6439 }
6440 }
6441 }
6442
6443 emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps);
6444
6445 MachineBasicBlock::iterator First = RemainderBB->begin();
6446 // Restore SCC
6447 if (SCCNotDead) {
6448 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
6449 .addReg(SaveSCCReg, RegState::Kill)
6450 .addImm(0);
6451 }
6452
6453 // Restore the EXEC mask
6454 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
6455 return BodyBB;
6456}
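// Resulting control flow (illustrative):
//   MBB -> LoopBB -> BodyBB -> RemainderBB
//            ^          |
//            +----------+   (BodyBB loops back until all lanes are handled)
// SCC and EXEC are saved in MBB and restored at the top of RemainderBB, so the
// waterfall is invisible to the surrounding code apart from the extra blocks.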
6457
6458// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
6459static std::tuple<unsigned, unsigned>
6460 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
6461 MachineBasicBlock &MBB = *MI.getParent();
6462 MachineFunction &MF = *MBB.getParent();
6464
6465 // Extract the ptr from the resource descriptor.
6466 unsigned RsrcPtr =
6467 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
6468 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
6469
6470 // Create an empty resource descriptor
6471 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6472 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6473 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6474 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6475 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
6476
6477 // Zero64 = 0
6478 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
6479 .addImm(0);
6480
6481 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6482 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
6483 .addImm(RsrcDataFormat & 0xFFFFFFFF);
6484
6485 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6486 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
6487 .addImm(RsrcDataFormat >> 32);
6488
6489 // NewSRsrc = {Zero64, SRsrcFormat}
6490 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
6491 .addReg(Zero64)
6492 .addImm(AMDGPU::sub0_sub1)
6493 .addReg(SRsrcFormatLo)
6494 .addImm(AMDGPU::sub2)
6495 .addReg(SRsrcFormatHi)
6496 .addImm(AMDGPU::sub3);
6497
6498 return std::tuple(RsrcPtr, NewSRsrc);
6499}
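// Illustrative result (a sketch): the caller gets back the 64-bit base pointer
// (RsrcPtr) to add to VADDR, while the instruction keeps a "null" descriptor
// whose base address is zero and whose upper dwords carry the default data
// format:
//   NewSRsrc = { 0, 0, RSRC_DATA_FORMAT[31:0], RSRC_DATA_FORMAT[63:32] }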
6500
6501 MachineBasicBlock *
6502 SIInstrInfo::legalizeOperands(MachineInstr &MI,
6503                               MachineDominatorTree *MDT) const {
6504 MachineFunction &MF = *MI.getParent()->getParent();
6505 MachineRegisterInfo &MRI = MF.getRegInfo();
6506 MachineBasicBlock *CreatedBB = nullptr;
6507
6508 // Legalize VOP2
6509 if (isVOP2(MI) || isVOPC(MI)) {
6510 legalizeOperandsVOP2(MRI, MI);
6511 return CreatedBB;
6512 }
6513
6514 // Legalize VOP3
6515 if (isVOP3(MI)) {
6516 legalizeOperandsVOP3(MRI, MI);
6517 return CreatedBB;
6518 }
6519
6520 // Legalize SMRD
6521 if (isSMRD(MI)) {
6522 legalizeOperandsSMRD(MRI, MI);
6523 return CreatedBB;
6524 }
6525
6526 // Legalize FLAT
6527 if (isFLAT(MI)) {
6528 legalizeOperandsFLAT(MRI, MI);
6529 return CreatedBB;
6530 }
6531
6532 // Legalize REG_SEQUENCE and PHI
6533 // The register class of the operands must be the same type as the register
6534 // class of the output.
6535 if (MI.getOpcode() == AMDGPU::PHI) {
6536 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
6537 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
6538 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
6539 continue;
6540 const TargetRegisterClass *OpRC =
6541 MRI.getRegClass(MI.getOperand(i).getReg());
6542 if (RI.hasVectorRegisters(OpRC)) {
6543 VRC = OpRC;
6544 } else {
6545 SRC = OpRC;
6546 }
6547 }
6548
6549 // If any of the operands are VGPR registers, then they all must be VGPRs,
6550 // otherwise we will create illegal VGPR->SGPR copies when legalizing
6551 // them.
6552 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
6553 if (!VRC) {
6554 assert(SRC);
6555 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
6556 VRC = &AMDGPU::VReg_1RegClass;
6557 } else
6558 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6559 ? RI.getEquivalentAGPRClass(SRC)
6560 : RI.getEquivalentVGPRClass(SRC);
6561 } else {
6562 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6563 ? RI.getEquivalentAGPRClass(VRC)
6564 : RI.getEquivalentVGPRClass(VRC);
6565 }
6566 RC = VRC;
6567 } else {
6568 RC = SRC;
6569 }
6570
6571 // Update all the operands so they have the same type.
6572 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6573 MachineOperand &Op = MI.getOperand(I);
6574 if (!Op.isReg() || !Op.getReg().isVirtual())
6575 continue;
6576
6577 // MI is a PHI instruction.
6578 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
6580
6581 // Avoid creating no-op copies with the same src and dst reg class. These
6582 // confuse some of the machine passes.
6583 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
6584 }
6585 }
6586
6587 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6588 // VGPR dest type and SGPR sources, insert copies so all operands are
6589 // VGPRs. This seems to help operand folding / the register coalescer.
6590 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
6591 MachineBasicBlock *MBB = MI.getParent();
6592 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
6593 if (RI.hasVGPRs(DstRC)) {
6594 // Update all the operands so they are VGPR register classes. These may
6595 // not be the same register class because REG_SEQUENCE supports mixing
6596 // subregister index types e.g. sub0_sub1 + sub2 + sub3
6597 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6598 MachineOperand &Op = MI.getOperand(I);
6599 if (!Op.isReg() || !Op.getReg().isVirtual())
6600 continue;
6601
6602 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
6603 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
6604 if (VRC == OpRC)
6605 continue;
6606
6607 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
6608 Op.setIsKill();
6609 }
6610 }
6611
6612 return CreatedBB;
6613 }
6614
6615 // Legalize INSERT_SUBREG
6616 // src0 must have the same register class as dst
6617 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
6618 Register Dst = MI.getOperand(0).getReg();
6619 Register Src0 = MI.getOperand(1).getReg();
6620 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
6621 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
6622 if (DstRC != Src0RC) {
6623 MachineBasicBlock *MBB = MI.getParent();
6624 MachineOperand &Op = MI.getOperand(1);
6625 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
6626 }
6627 return CreatedBB;
6628 }
6629
6630 // Legalize SI_INIT_M0
6631 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
6632 MachineOperand &Src = MI.getOperand(0);
6633 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6634 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6635 return CreatedBB;
6636 }
6637
6638 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6639 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
6640 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
6641 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
6642 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
6643 MI.getOpcode() == AMDGPU::S_WQM_B64) {
6644 MachineOperand &Src = MI.getOperand(1);
6645 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6646 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6647 return CreatedBB;
6648 }
6649
6650 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
6651 //
6652 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
6653 // scratch memory access. In both cases, the legalization never involves
6654 // conversion to the addr64 form.
6656 (isMUBUF(MI) || isMTBUF(MI)))) {
6657 int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc
6658 : AMDGPU::OpName::srsrc;
6659 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
6660 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
6661 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
6662
6663 int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
6664 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
6665 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
6666 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
6667
6668 return CreatedBB;
6669 }
6670
6671 // Legalize SI_CALL
6672 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
6673 MachineOperand *Dest = &MI.getOperand(0);
6674 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
6675 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, as well as
6676 // the following copies; we also need to move copies from and to physical
6677 // registers into the loop block.
6678 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
6679 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
6680
6681 // Also move the copies to physical registers into the loop block
6682 MachineBasicBlock &MBB = *MI.getParent();
6684 while (Start->getOpcode() != FrameSetupOpcode)
6685 --Start;
6687 while (End->getOpcode() != FrameDestroyOpcode)
6688 ++End;
6689 // Also include following copies of the return value
6690 ++End;
6691 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
6692 MI.definesRegister(End->getOperand(1).getReg()))
6693 ++End;
6694 CreatedBB =
6695 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
6696 }
6697 }
6698
6699 // Legalize s_sleep_var.
6700 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
6701 const DebugLoc &DL = MI.getDebugLoc();
6702 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6703 int Src0Idx =
6704 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
6705 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6706 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6707 .add(Src0);
6708 Src0.ChangeToRegister(Reg, false);
6709 return nullptr;
6710 }
6711
6712 // Legalize MUBUF instructions.
6713 bool isSoffsetLegal = true;
6714 int SoffsetIdx =
6715 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
6716 if (SoffsetIdx != -1) {
6717 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
6718 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
6719 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
6720 isSoffsetLegal = false;
6721 }
6722 }
6723
6724 bool isRsrcLegal = true;
6725 int RsrcIdx =
6726 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
6727 if (RsrcIdx != -1) {
6728 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6729 if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) {
6730 isRsrcLegal = false;
6731 }
6732 }
6733
6734 // The operands are legal.
6735 if (isRsrcLegal && isSoffsetLegal)
6736 return CreatedBB;
6737
6738 if (!isRsrcLegal) {
6739 // Legalize a VGPR Rsrc
6740 //
6741 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
6742 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
6743 // a zero-value SRsrc.
6744 //
6745 // If the instruction is _OFFSET (both idxen and offen disabled), and we
6746 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
6747 // above.
6748 //
6749 // Otherwise we are on non-ADDR64 hardware, and/or we have
6750 // idxen/offen/bothen and we fall back to a waterfall loop.
6751
6752 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6753 MachineBasicBlock &MBB = *MI.getParent();
6754
6755 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
6756 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
6757 // This is already an ADDR64 instruction so we need to add the pointer
6758 // extracted from the resource descriptor to the current value of VAddr.
6759 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6760 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6761 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6762
6763 const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6764 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
6765 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
6766
6767 unsigned RsrcPtr, NewSRsrc;
6768 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6769
6770 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
6771 const DebugLoc &DL = MI.getDebugLoc();
6772 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
6773 .addDef(CondReg0)
6774 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6775 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
6776 .addImm(0);
6777
6778 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
6779 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
6780 .addDef(CondReg1, RegState::Dead)
6781 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6782 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
6783 .addReg(CondReg0, RegState::Kill)
6784 .addImm(0);
6785
6786 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6787 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
6788 .addReg(NewVAddrLo)
6789 .addImm(AMDGPU::sub0)
6790 .addReg(NewVAddrHi)
6791 .addImm(AMDGPU::sub1);
6792
6793 VAddr->setReg(NewVAddr);
6794 Rsrc->setReg(NewSRsrc);
6795 } else if (!VAddr && ST.hasAddr64()) {
6796 // This instruction is the _OFFSET variant, so we need to convert it to
6797 // ADDR64.
6799 "FIXME: Need to emit flat atomics here");
6800
6801 unsigned RsrcPtr, NewSRsrc;
6802 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6803
6804 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6805 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
6806 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
6807 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6808 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
6809
6810 // Atomics with return have an additional tied operand and are
6811 // missing some of the special bits.
6812 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
6813 MachineInstr *Addr64;
6814
6815 if (!VDataIn) {
6816 // Regular buffer load / store.
6818 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6819 .add(*VData)
6820 .addReg(NewVAddr)
6821 .addReg(NewSRsrc)
6822 .add(*SOffset)
6823 .add(*Offset);
6824
6825 if (const MachineOperand *CPol =
6826 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
6827 MIB.addImm(CPol->getImm());
6828 }
6829
6830 if (const MachineOperand *TFE =
6831 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
6832 MIB.addImm(TFE->getImm());
6833 }
6834
6835 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
6836
6837 MIB.cloneMemRefs(MI);
6838 Addr64 = MIB;
6839 } else {
6840 // Atomics with return.
6841 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6842 .add(*VData)
6843 .add(*VDataIn)
6844 .addReg(NewVAddr)
6845 .addReg(NewSRsrc)
6846 .add(*SOffset)
6847 .add(*Offset)
6848 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
6849 .cloneMemRefs(MI);
6850 }
6851
6852 MI.removeFromParent();
6853
6854 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6855 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
6856 NewVAddr)
6857 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6858 .addImm(AMDGPU::sub0)
6859 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6860 .addImm(AMDGPU::sub1);
6861 } else {
6862 // Legalize a VGPR Rsrc and soffset together.
6863 if (!isSoffsetLegal) {
6864 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6865 CreatedBB =
6866 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
6867 return CreatedBB;
6868 }
6869 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
6870 return CreatedBB;
6871 }
6872 }
6873
6874 // Legalize a VGPR soffset.
6875 if (!isSoffsetLegal) {
6876 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6877 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
6878 return CreatedBB;
6879 }
6880 return CreatedBB;
6881}
6882
6883 void SIInstrWorklist::insert(MachineInstr *MI) {
6884 InstrList.insert(MI);
6885 // Add MBUF instructions to the deferred list.
6886 int RsrcIdx =
6887 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
6888 if (RsrcIdx != -1) {
6889 DeferredList.insert(MI);
6890 }
6891}
6892
6893 bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
6894 return DeferredList.contains(MI);
6895}
6896
6897 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
6898 MachineDominatorTree *MDT) const {
6899
6900 while (!Worklist.empty()) {
6901 MachineInstr &Inst = *Worklist.top();
6902 Worklist.erase_top();
6903 // Skip MachineInstr in the deferred list.
6904 if (Worklist.isDeferred(&Inst))
6905 continue;
6906 moveToVALUImpl(Worklist, MDT, Inst);
6907 }
6908
6909 // The deferred list of instructions will be processed once
6910 // all the MachineInstrs in the worklist are done.
6911 for (MachineInstr *Inst : Worklist.getDeferredList()) {
6912 moveToVALUImpl(Worklist, MDT, *Inst);
6913 assert(Worklist.empty() &&
6914 "Deferred MachineInstr are not supposed to re-populate worklist");
6915 }
6916}
6917
6918 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
6919 MachineDominatorTree *MDT,
6920 MachineInstr &Inst) const {
6921 
6922 MachineBasicBlock *MBB = Inst.getParent();
6923 if (!MBB)
6924 return;
6925 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6926 unsigned Opcode = Inst.getOpcode();
6927 unsigned NewOpcode = getVALUOp(Inst);
6928 // Handle some special cases
6929 switch (Opcode) {
6930 default:
6931 break;
6932 case AMDGPU::S_ADD_U64_PSEUDO:
6933 NewOpcode = AMDGPU::V_ADD_U64_PSEUDO;
6934 break;
6935 case AMDGPU::S_SUB_U64_PSEUDO:
6936 NewOpcode = AMDGPU::V_SUB_U64_PSEUDO;
6937 break;
6938 case AMDGPU::S_ADD_I32:
6939 case AMDGPU::S_SUB_I32: {
6940 // FIXME: The u32 versions currently selected use the carry.
6941 bool Changed;
6942 MachineBasicBlock *CreatedBBTmp = nullptr;
6943 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
6944 if (Changed)
6945 return;
6946
6947 // Default handling
6948 break;
6949 }
6950
6951 case AMDGPU::S_MUL_U64:
6952 // Split s_mul_u64 into 32-bit vector multiplications.
6953 splitScalarSMulU64(Worklist, Inst, MDT);
6954 Inst.eraseFromParent();
6955 return;
6956
6957 case AMDGPU::S_MUL_U64_U32_PSEUDO:
6958 case AMDGPU::S_MUL_I64_I32_PSEUDO:
6959 // This is a special case of s_mul_u64 where all the operands are either
6960 // zero extended or sign extended.
6961 splitScalarSMulPseudo(Worklist, Inst, MDT);
6962 Inst.eraseFromParent();
6963 return;
6964
6965 case AMDGPU::S_AND_B64:
6966 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
6967 Inst.eraseFromParent();
6968 return;
6969
6970 case AMDGPU::S_OR_B64:
6971 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
6972 Inst.eraseFromParent();
6973 return;
6974
6975 case AMDGPU::S_XOR_B64:
6976 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
6977 Inst.eraseFromParent();
6978 return;
6979
6980 case AMDGPU::S_NAND_B64:
6981 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
6982 Inst.eraseFromParent();
6983 return;
6984
6985 case AMDGPU::S_NOR_B64:
6986 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
6987 Inst.eraseFromParent();
6988 return;
6989
6990 case AMDGPU::S_XNOR_B64:
6991 if (ST.hasDLInsts())
6992 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
6993 else
6994 splitScalar64BitXnor(Worklist, Inst, MDT);
6995 Inst.eraseFromParent();
6996 return;
6997
6998 case AMDGPU::S_ANDN2_B64:
6999 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7000 Inst.eraseFromParent();
7001 return;
7002
7003 case AMDGPU::S_ORN2_B64:
7004 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7005 Inst.eraseFromParent();
7006 return;
7007
7008 case AMDGPU::S_BREV_B64:
7009 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7010 Inst.eraseFromParent();
7011 return;
7012
7013 case AMDGPU::S_NOT_B64:
7014 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7015 Inst.eraseFromParent();
7016 return;
7017
7018 case AMDGPU::S_BCNT1_I32_B64:
7019 splitScalar64BitBCNT(Worklist, Inst);
7020 Inst.eraseFromParent();
7021 return;
7022
7023 case AMDGPU::S_BFE_I64:
7024 splitScalar64BitBFE(Worklist, Inst);
7025 Inst.eraseFromParent();
7026 return;
7027
7028 case AMDGPU::S_FLBIT_I32_B64:
7029 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7030 Inst.eraseFromParent();
7031 return;
7032 case AMDGPU::S_FF1_I32_B64:
7033 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7034 Inst.eraseFromParent();
7035 return;
7036
7037 case AMDGPU::S_LSHL_B32:
7038 if (ST.hasOnlyRevVALUShifts()) {
7039 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7040 swapOperands(Inst);
7041 }
7042 break;
7043 case AMDGPU::S_ASHR_I32:
7044 if (ST.hasOnlyRevVALUShifts()) {
7045 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7046 swapOperands(Inst);
7047 }
7048 break;
7049 case AMDGPU::S_LSHR_B32:
7050 if (ST.hasOnlyRevVALUShifts()) {
7051 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7052 swapOperands(Inst);
7053 }
7054 break;
7055 case AMDGPU::S_LSHL_B64:
7056 if (ST.hasOnlyRevVALUShifts()) {
7057 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7058 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7059 : AMDGPU::V_LSHLREV_B64_e64;
7060 swapOperands(Inst);
7061 }
7062 break;
7063 case AMDGPU::S_ASHR_I64:
7064 if (ST.hasOnlyRevVALUShifts()) {
7065 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7066 swapOperands(Inst);
7067 }
7068 break;
7069 case AMDGPU::S_LSHR_B64:
7070 if (ST.hasOnlyRevVALUShifts()) {
7071 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7072 swapOperands(Inst);
7073 }
7074 break;
7075
7076 case AMDGPU::S_ABS_I32:
7077 lowerScalarAbs(Worklist, Inst);
7078 Inst.eraseFromParent();
7079 return;
7080
7081 case AMDGPU::S_CBRANCH_SCC0:
7082 case AMDGPU::S_CBRANCH_SCC1: {
7083 // Clear unused bits of vcc
7084 Register CondReg = Inst.getOperand(1).getReg();
7085 bool IsSCC = CondReg == AMDGPU::SCC;
7086 Register VCC = RI.getVCC();
7087 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7088 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7089 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7090 .addReg(EXEC)
7091 .addReg(IsSCC ? VCC : CondReg);
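 // In effect this computes vcc = exec & cond, clearing condition bits for
 // inactive lanes before the branch reads vcc.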
7092 Inst.removeOperand(1);
7093 } break;
7094
7095 case AMDGPU::S_BFE_U64:
7096 case AMDGPU::S_BFM_B64:
7097 llvm_unreachable("Moving this op to VALU not implemented");
7098
7099 case AMDGPU::S_PACK_LL_B32_B16:
7100 case AMDGPU::S_PACK_LH_B32_B16:
7101 case AMDGPU::S_PACK_HL_B32_B16:
7102 case AMDGPU::S_PACK_HH_B32_B16:
7103 movePackToVALU(Worklist, MRI, Inst);
7104 Inst.eraseFromParent();
7105 return;
7106
7107 case AMDGPU::S_XNOR_B32:
7108 lowerScalarXnor(Worklist, Inst);
7109 Inst.eraseFromParent();
7110 return;
7111
7112 case AMDGPU::S_NAND_B32:
7113 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7114 Inst.eraseFromParent();
7115 return;
7116
7117 case AMDGPU::S_NOR_B32:
7118 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7119 Inst.eraseFromParent();
7120 return;
7121
7122 case AMDGPU::S_ANDN2_B32:
7123 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7124 Inst.eraseFromParent();
7125 return;
7126
7127 case AMDGPU::S_ORN2_B32:
7128 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7129 Inst.eraseFromParent();
7130 return;
7131
7132 // TODO: remove as soon as everything is ready
7133 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7134 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7135 // can only be selected from the uniform SDNode.
7136 case AMDGPU::S_ADD_CO_PSEUDO:
7137 case AMDGPU::S_SUB_CO_PSEUDO: {
7138 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7139 ? AMDGPU::V_ADDC_U32_e64
7140 : AMDGPU::V_SUBB_U32_e64;
7141 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7142
7143 Register CarryInReg = Inst.getOperand(4).getReg();
7144 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7145 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7146 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7147 .addReg(CarryInReg);
7148 }
7149
7150 Register CarryOutReg = Inst.getOperand(1).getReg();
7151
7152 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7153 MRI.getRegClass(Inst.getOperand(0).getReg())));
7154 MachineInstr *CarryOp =
7155 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7156 .addReg(CarryOutReg, RegState::Define)
7157 .add(Inst.getOperand(2))
7158 .add(Inst.getOperand(3))
7159 .addReg(CarryInReg)
7160 .addImm(0);
7161 legalizeOperands(*CarryOp);
7162 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7163 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7164 Inst.eraseFromParent();
7165 }
7166 return;
7167 case AMDGPU::S_UADDO_PSEUDO:
7168 case AMDGPU::S_USUBO_PSEUDO: {
7169 const DebugLoc &DL = Inst.getDebugLoc();
7170 MachineOperand &Dest0 = Inst.getOperand(0);
7171 MachineOperand &Dest1 = Inst.getOperand(1);
7172 MachineOperand &Src0 = Inst.getOperand(2);
7173 MachineOperand &Src1 = Inst.getOperand(3);
7174
7175 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7176 ? AMDGPU::V_ADD_CO_U32_e64
7177 : AMDGPU::V_SUB_CO_U32_e64;
7178 const TargetRegisterClass *NewRC =
7179 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7180 Register DestReg = MRI.createVirtualRegister(NewRC);
7181 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7182 .addReg(Dest1.getReg(), RegState::Define)
7183 .add(Src0)
7184 .add(Src1)
7185 .addImm(0); // clamp bit
7186
7187 legalizeOperands(*NewInstr, MDT);
7188 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7189 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7190 Worklist);
7191 Inst.eraseFromParent();
7192 }
7193 return;
7194
7195 case AMDGPU::S_CSELECT_B32:
7196 case AMDGPU::S_CSELECT_B64:
7197 lowerSelect(Worklist, Inst, MDT);
7198 Inst.eraseFromParent();
7199 return;
7200 case AMDGPU::S_CMP_EQ_I32:
7201 case AMDGPU::S_CMP_LG_I32:
7202 case AMDGPU::S_CMP_GT_I32:
7203 case AMDGPU::S_CMP_GE_I32:
7204 case AMDGPU::S_CMP_LT_I32:
7205 case AMDGPU::S_CMP_LE_I32:
7206 case AMDGPU::S_CMP_EQ_U32:
7207 case AMDGPU::S_CMP_LG_U32:
7208 case AMDGPU::S_CMP_GT_U32:
7209 case AMDGPU::S_CMP_GE_U32:
7210 case AMDGPU::S_CMP_LT_U32:
7211 case AMDGPU::S_CMP_LE_U32:
7212 case AMDGPU::S_CMP_EQ_U64:
7213 case AMDGPU::S_CMP_LG_U64:
7214 case AMDGPU::S_CMP_LT_F32:
7215 case AMDGPU::S_CMP_EQ_F32:
7216 case AMDGPU::S_CMP_LE_F32:
7217 case AMDGPU::S_CMP_GT_F32:
7218 case AMDGPU::S_CMP_LG_F32:
7219 case AMDGPU::S_CMP_GE_F32:
7220 case AMDGPU::S_CMP_O_F32:
7221 case AMDGPU::S_CMP_U_F32:
7222 case AMDGPU::S_CMP_NGE_F32:
7223 case AMDGPU::S_CMP_NLG_F32:
7224 case AMDGPU::S_CMP_NGT_F32:
7225 case AMDGPU::S_CMP_NLE_F32:
7226 case AMDGPU::S_CMP_NEQ_F32:
7227 case AMDGPU::S_CMP_NLT_F32:
7228 case AMDGPU::S_CMP_LT_F16:
7229 case AMDGPU::S_CMP_EQ_F16:
7230 case AMDGPU::S_CMP_LE_F16:
7231 case AMDGPU::S_CMP_GT_F16:
7232 case AMDGPU::S_CMP_LG_F16:
7233 case AMDGPU::S_CMP_GE_F16:
7234 case AMDGPU::S_CMP_O_F16:
7235 case AMDGPU::S_CMP_U_F16:
7236 case AMDGPU::S_CMP_NGE_F16:
7237 case AMDGPU::S_CMP_NLG_F16:
7238 case AMDGPU::S_CMP_NGT_F16:
7239 case AMDGPU::S_CMP_NLE_F16:
7240 case AMDGPU::S_CMP_NEQ_F16:
7241 case AMDGPU::S_CMP_NLT_F16: {
7242 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7243 auto NewInstr =
7244 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7245 .setMIFlags(Inst.getFlags());
7246 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7247 AMDGPU::OpName::src0_modifiers) >= 0) {
7248 NewInstr
7249 .addImm(0) // src0_modifiers
7250 .add(Inst.getOperand(0)) // src0
7251 .addImm(0) // src1_modifiers
7252 .add(Inst.getOperand(1)) // src1
7253 .addImm(0); // clamp
7254 } else {
7255 NewInstr
7256 .add(Inst.getOperand(0))
7257 .add(Inst.getOperand(1));
7258 }
7259 legalizeOperands(*NewInstr, MDT);
7260 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC);
7261 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7262 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7263 Inst.eraseFromParent();
7264 return;
7265 }
7266 case AMDGPU::S_CVT_HI_F32_F16: {
7267 const DebugLoc &DL = Inst.getDebugLoc();
7268 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7269 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7270 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7271 .addImm(16)
7272 .add(Inst.getOperand(1));
7273 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7274 .addImm(0) // src0_modifiers
7275 .addReg(TmpReg)
7276 .addImm(0) // clamp
7277 .addImm(0); // omod
7278
7279 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7280 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7281 Inst.eraseFromParent();
7282 return;
7283 }
7284 case AMDGPU::S_MINIMUM_F32:
7285 case AMDGPU::S_MAXIMUM_F32:
7286 case AMDGPU::S_MINIMUM_F16:
7287 case AMDGPU::S_MAXIMUM_F16: {
7288 const DebugLoc &DL = Inst.getDebugLoc();
7289 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7290 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7291 .addImm(0) // src0_modifiers
7292 .add(Inst.getOperand(1))
7293 .addImm(0) // src1_modifiers
7294 .add(Inst.getOperand(2))
7295 .addImm(0) // clamp
7296 .addImm(0); // omod
7297 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7298
7299 legalizeOperands(*NewInstr, MDT);
7300 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7301 Inst.eraseFromParent();
7302 return;
7303 }
7304 }
7305
7306 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
7307 // We cannot move this instruction to the VALU, so we should try to
7308 // legalize its operands instead.
7309 legalizeOperands(Inst, MDT);
7310 return;
7311 }
7312 // Handle converting generic instructions like COPY-to-SGPR into
7313 // COPY-to-VGPR.
7314 if (NewOpcode == Opcode) {
7315 Register DstReg = Inst.getOperand(0).getReg();
7316 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7317
7318 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7319 // hope for the best.
7320 if (Inst.isCopy() && DstReg.isPhysical() &&
7321 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7322 // TODO: Only works for 32 bit registers.
7323 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7324 get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7325 .add(Inst.getOperand(1));
7326 Inst.eraseFromParent();
7327 return;
7328 }
7329
7330 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
7331 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
7332 // Instead of creating a copy where src and dst are the same register
7333 // class, we just replace all uses of dst with src. These kinds of
7334 // copies interfere with the heuristics MachineSink uses to decide
7335 // whether or not to split a critical edge, since the pass assumes
7336 // that copies will end up as machine instructions and not be
7337 // eliminated.
7338 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
7339 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
7340 MRI.clearKillFlags(Inst.getOperand(1).getReg());
7341 Inst.getOperand(0).setReg(DstReg);
7342 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7343 // these are deleted later, but at -O0 it would leave a suspicious
7344 // looking illegal copy of an undef register.
7345 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7346 Inst.removeOperand(I);
7347 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
7348 return;
7349 }
7350 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7351 MRI.replaceRegWith(DstReg, NewDstReg);
7352 legalizeOperands(Inst, MDT);
7353 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7354 return;
7355 }
7356
7357 // Use the new VALU Opcode.
7358 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
7359 .setMIFlags(Inst.getFlags());
7360 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
7361 // Intersperse VOP3 modifiers among the SALU operands.
7362 NewInstr->addOperand(Inst.getOperand(0));
7363 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7364 AMDGPU::OpName::src0_modifiers) >= 0)
7365 NewInstr.addImm(0);
7366 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7367 MachineOperand Src = Inst.getOperand(1);
7368 if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7369 Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7370 NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7371 else
7372 NewInstr->addOperand(Src);
7373 }
7374
7375 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
7376 // We are converting these to a BFE, so we need to add the missing
7377 // operands for the size and offset.
7378 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
7379 NewInstr.addImm(0);
7380 NewInstr.addImm(Size);
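 // For example, s_sext_i32_i8 effectively becomes v_bfe_i32 dst, src, 0, 8,
 // i.e. a signed bitfield extract of the low 8 bits.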
7381 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
7382 // The VALU version adds the second operand to the result, so insert an
7383 // extra 0 operand.
7384 NewInstr.addImm(0);
7385 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
7386 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
7387 // If we need to move this to VGPRs, we need to unpack the second
7388 // operand back into the 2 separate ones for bit offset and width.
7389 assert(OffsetWidthOp.isImm() &&
7390 "Scalar BFE is only implemented for constant width and offset");
7391 uint32_t Imm = OffsetWidthOp.getImm();
7392
7393 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
7394 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
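 // Worked example: Imm = 0x00080010 unpacks to Offset = 16 and BitWidth = 8.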
7395 NewInstr.addImm(Offset);
7396 NewInstr.addImm(BitWidth);
7397 } else {
7398 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7399 AMDGPU::OpName::src1_modifiers) >= 0)
7400 NewInstr.addImm(0);
7401 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
7402 NewInstr->addOperand(Inst.getOperand(2));
7403 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7404 AMDGPU::OpName::src2_modifiers) >= 0)
7405 NewInstr.addImm(0);
7406 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
7407 NewInstr->addOperand(Inst.getOperand(3));
7408 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
7409 NewInstr.addImm(0);
7410 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
7411 NewInstr.addImm(0);
7412 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
7413 NewInstr.addImm(0);
7414 }
7415 } else {
7416 // Just copy the SALU operands.
7417 for (const MachineOperand &Op : Inst.explicit_operands())
7418 NewInstr->addOperand(Op);
7419 }
7420
7421 // Remove any references to SCC. Vector instructions can't read from it, and
7422 // we're just about to add the implicit use / defs of VCC, and we don't want
7423 // both.
7424 for (MachineOperand &Op : Inst.implicit_operands()) {
7425 if (Op.getReg() == AMDGPU::SCC) {
7426 // Only propagate through live-def of SCC.
7427 if (Op.isDef() && !Op.isDead())
7428 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
7429 if (Op.isUse())
7430 addSCCDefsToVALUWorklist(NewInstr, Worklist);
7431 }
7432 }
7433 Inst.eraseFromParent();
7434 Register NewDstReg;
7435 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
7436 Register DstReg = NewInstr->getOperand(0).getReg();
7437 assert(DstReg.isVirtual());
7438 // Update the destination register class.
7439 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
7440 assert(NewDstRC);
7441 NewDstReg = MRI.createVirtualRegister(NewDstRC);
7442 MRI.replaceRegWith(DstReg, NewDstReg);
7443 }
7444 fixImplicitOperands(*NewInstr);
7445 // Legalize the operands
7446 legalizeOperands(*NewInstr, MDT);
7447 if (NewDstReg)
7448 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7449}
7450
7451// Add/sub require special handling to deal with carry outs.
7452std::pair<bool, MachineBasicBlock *>
7453SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
7454 MachineDominatorTree *MDT) const {
7455 if (ST.hasAddNoCarry()) {
7456 // Assume there is no user of scc since we don't select this in that case.
7457 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
7458 // is used.
7459
7460 MachineBasicBlock &MBB = *Inst.getParent();
7461 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7462 
7463 Register OldDstReg = Inst.getOperand(0).getReg();
7464 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7465
7466 unsigned Opc = Inst.getOpcode();
7467 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
7468
7469 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
7470 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
7471
7472 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
7473 Inst.removeOperand(3);
7474
7475 Inst.setDesc(get(NewOpc));
7476 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
7477 Inst.addImplicitDefUseOperands(*MBB.getParent());
7478 MRI.replaceRegWith(OldDstReg, ResultReg);
7479 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
7480
7481 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7482 return std::pair(true, NewBB);
7483 }
7484
7485 return std::pair(false, nullptr);
7486}
7487
7488void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
7489 MachineDominatorTree *MDT) const {
7490
7491 MachineBasicBlock &MBB = *Inst.getParent();
7492 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7493 MachineBasicBlock::iterator MII = Inst;
7494 DebugLoc DL = Inst.getDebugLoc();
7495
7496 MachineOperand &Dest = Inst.getOperand(0);
7497 MachineOperand &Src0 = Inst.getOperand(1);
7498 MachineOperand &Src1 = Inst.getOperand(2);
7499 MachineOperand &Cond = Inst.getOperand(3);
7500
7501 Register CondReg = Cond.getReg();
7502 bool IsSCC = (CondReg == AMDGPU::SCC);
7503
7504 // If this is a trivial select where the condition is effectively not SCC
7505 // (CondReg is a source of copy to SCC), then the select is semantically
7506 // equivalent to copying CondReg. Hence, there is no need to create a
7507 // V_CNDMASK; we can just use CondReg directly and bail out.
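 // (With Src0 == -1 and Src1 == 0 the select computes cond ? -1 : 0, which is
 // exactly the lane mask already held in CondReg.)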
7508 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
7509 (Src1.getImm() == 0)) {
7510 MRI.replaceRegWith(Dest.getReg(), CondReg);
7511 return;
7512 }
7513
7514 Register NewCondReg = CondReg;
7515 if (IsSCC) {
7516 const TargetRegisterClass *TC =
7517 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7518 NewCondReg = MRI.createVirtualRegister(TC);
7519
7520 // Now look for the closest SCC def if it is a copy
7521 // replacing the CondReg with the COPY source register
7522 bool CopyFound = false;
7523 for (MachineInstr &CandI :
7524 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
7525 Inst.getParent()->rend())) {
7526 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
7527 -1) {
7528 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
7529 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
7530 .addReg(CandI.getOperand(1).getReg());
7531 CopyFound = true;
7532 }
7533 break;
7534 }
7535 }
7536 if (!CopyFound) {
7537 // SCC def is not a copy
7538 // Insert a trivial select instead of creating a copy, because a copy from
7539 // SCC would semantically mean just copying a single bit, but we may need
7540 // the result to be a vector condition mask that needs preserving.
7541 unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
7542 : AMDGPU::S_CSELECT_B32;
7543 auto NewSelect =
7544 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
7545 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
7546 }
7547 }
7548
7549 Register NewDestReg = MRI.createVirtualRegister(
7550 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
7551 MachineInstr *NewInst;
7552 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
7553 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
7554 .addImm(0)
7555 .add(Src1) // False
7556 .addImm(0)
7557 .add(Src0) // True
7558 .addReg(NewCondReg);
7559 } else {
7560 NewInst =
7561 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
7562 .add(Src1) // False
7563 .add(Src0) // True
7564 .addReg(NewCondReg);
7565 }
7566 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
7567 legalizeOperands(*NewInst, MDT);
7568 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
7569}
7570
7571void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
7572 MachineInstr &Inst) const {
7573 MachineBasicBlock &MBB = *Inst.getParent();
7574 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7575 MachineBasicBlock::iterator MII = Inst;
7576 DebugLoc DL = Inst.getDebugLoc();
7577
7578 MachineOperand &Dest = Inst.getOperand(0);
7579 MachineOperand &Src = Inst.getOperand(1);
7580 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7581 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7582
7583 unsigned SubOp = ST.hasAddNoCarry() ?
7584 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
7585
7586 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
7587 .addImm(0)
7588 .addReg(Src.getReg());
7589
7590 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
7591 .addReg(Src.getReg())
7592 .addReg(TmpReg);
7593
7594 MRI.replaceRegWith(Dest.getReg(), ResultReg);
7595 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7596}
7597
7598void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
7599 MachineInstr &Inst) const {
7600 MachineBasicBlock &MBB = *Inst.getParent();
7601 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7602 MachineBasicBlock::iterator MII = Inst;
7603 const DebugLoc &DL = Inst.getDebugLoc();
7604
7605 MachineOperand &Dest = Inst.getOperand(0);
7606 MachineOperand &Src0 = Inst.getOperand(1);
7607 MachineOperand &Src1 = Inst.getOperand(2);
7608
7609 if (ST.hasDLInsts()) {
7610 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7611 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
7612 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
7613
7614 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
7615 .add(Src0)
7616 .add(Src1);
7617
7618 MRI.replaceRegWith(Dest.getReg(), NewDest);
7619 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7620 } else {
7621 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
7622 // invert either source and then perform the XOR. If either source is a
7623 // scalar register, then we can leave the inversion on the scalar unit to
7624 // achieve a better distribution of scalar and vector instructions.
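 // For example, with a scalar Src0 this becomes t = s_not_b32 Src0 followed by
 // s_xor_b32 dst, t, Src1, which equals ~(Src0 ^ Src1).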
7625 bool Src0IsSGPR = Src0.isReg() &&
7626 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
7627 bool Src1IsSGPR = Src1.isReg() &&
7628 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
7629 MachineInstr *Xor;
7630 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7631 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7632
7633 // Build a pair of scalar instructions and add them to the work list.
7634 // The next iteration over the work list will lower these to the vector
7635 // unit as necessary.
7636 if (Src0IsSGPR) {
7637 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
7638 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7639 .addReg(Temp)
7640 .add(Src1);
7641 } else if (Src1IsSGPR) {
7642 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
7643 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7644 .add(Src0)
7645 .addReg(Temp);
7646 } else {
7647 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
7648 .add(Src0)
7649 .add(Src1);
7650 MachineInstr *Not =
7651 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
7652 Worklist.insert(Not);
7653 }
7654
7655 MRI.replaceRegWith(Dest.getReg(), NewDest);
7656
7657 Worklist.insert(Xor);
7658
7659 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7660 }
7661}
7662
7663void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
7664 MachineInstr &Inst,
7665 unsigned Opcode) const {
7666 MachineBasicBlock &MBB = *Inst.getParent();
7667 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7668 MachineBasicBlock::iterator MII = Inst;
7669 const DebugLoc &DL = Inst.getDebugLoc();
7670
7671 MachineOperand &Dest = Inst.getOperand(0);
7672 MachineOperand &Src0 = Inst.getOperand(1);
7673 MachineOperand &Src1 = Inst.getOperand(2);
7674
7675 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7676 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7677
7678 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
7679 .add(Src0)
7680 .add(Src1);
7681
7682 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
7683 .addReg(Interm);
7684
7685 Worklist.insert(&Op);
7686 Worklist.insert(&Not);
7687
7688 MRI.replaceRegWith(Dest.getReg(), NewDest);
7689 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7690}
7691
7692void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
7693 MachineInstr &Inst,
7694 unsigned Opcode) const {
7695 MachineBasicBlock &MBB = *Inst.getParent();
7696 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7697 MachineBasicBlock::iterator MII = Inst;
7698 const DebugLoc &DL = Inst.getDebugLoc();
7699
7700 MachineOperand &Dest = Inst.getOperand(0);
7701 MachineOperand &Src0 = Inst.getOperand(1);
7702 MachineOperand &Src1 = Inst.getOperand(2);
7703
7704 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7705 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7706
7707 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
7708 .add(Src1);
7709
7710 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
7711 .add(Src0)
7712 .addReg(Interm);
7713
7714 Worklist.insert(&Not);
7715 Worklist.insert(&Op);
7716
7717 MRI.replaceRegWith(Dest.getReg(), NewDest);
7718 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7719}
7720
7721void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
7722 MachineInstr &Inst, unsigned Opcode,
7723 bool Swap) const {
7724 MachineBasicBlock &MBB = *Inst.getParent();
7725 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7726 
7727 MachineOperand &Dest = Inst.getOperand(0);
7728 MachineOperand &Src0 = Inst.getOperand(1);
7729 DebugLoc DL = Inst.getDebugLoc();
7730
7731 MachineBasicBlock::iterator MII = Inst;
7732
7733 const MCInstrDesc &InstDesc = get(Opcode);
7734 const TargetRegisterClass *Src0RC = Src0.isReg() ?
7735 MRI.getRegClass(Src0.getReg()) :
7736 &AMDGPU::SGPR_32RegClass;
7737
7738 const TargetRegisterClass *Src0SubRC =
7739 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7740
7741 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7742 AMDGPU::sub0, Src0SubRC);
7743
7744 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
7745 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
7746 const TargetRegisterClass *NewDestSubRC =
7747 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
7748
7749 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
7750 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
7751
7752 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7753 AMDGPU::sub1, Src0SubRC);
7754
7755 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
7756 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
7757
7758 if (Swap)
7759 std::swap(DestSub0, DestSub1);
7760
7761 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
7762 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7763 .addReg(DestSub0)
7764 .addImm(AMDGPU::sub0)
7765 .addReg(DestSub1)
7766 .addImm(AMDGPU::sub1);
7767
7768 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7769
7770 Worklist.insert(&LoHalf);
7771 Worklist.insert(&HiHalf);
7772
7773 // We don't need to legalizeOperands here because for a single operand, src0
7774 // will support any kind of input.
7775
7776 // Move all users of this moved value.
7777 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7778}
7779
7780// There is not a vector equivalent of s_mul_u64. For this reason, we need to
7781 // split the s_mul_u64 into 32-bit vector multiplications.
7782void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
7783 MachineInstr &Inst,
7784 MachineDominatorTree *MDT) const {
7785 MachineBasicBlock &MBB = *Inst.getParent();
7786 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7787 
7788 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7789 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7790 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7791
7792 MachineOperand &Dest = Inst.getOperand(0);
7793 MachineOperand &Src0 = Inst.getOperand(1);
7794 MachineOperand &Src1 = Inst.getOperand(2);
7795 const DebugLoc &DL = Inst.getDebugLoc();
7796 MachineBasicBlock::iterator MII = Inst;
7797
7798 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7799 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7800 const TargetRegisterClass *Src0SubRC =
7801 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7802 if (RI.isSGPRClass(Src0SubRC))
7803 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7804 const TargetRegisterClass *Src1SubRC =
7805 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7806 if (RI.isSGPRClass(Src1SubRC))
7807 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7808
7809 // First, we extract the low 32-bit and high 32-bit values from each of the
7810 // operands.
7811 MachineOperand Op0L =
7812 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7813 MachineOperand Op1L =
7814 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7815 MachineOperand Op0H =
7816 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
7817 MachineOperand Op1H =
7818 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
7819
7820 // The multiplication is done as follows:
7821 //
7822 // Op1H Op1L
7823 // * Op0H Op0L
7824 // --------------------
7825 // Op1H*Op0L Op1L*Op0L
7826 // + Op1H*Op0H Op1L*Op0H
7827 // -----------------------------------------
7828 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
7829 //
7830 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
7831 // value and that would overflow.
7832 // The low 32-bit value is Op1L*Op0L.
7833 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
7834
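 // A rough scalar sketch of what the sequence below computes (illustrative
 // only, using the sub-register names from above):
 //   uint32_t Lo    = Op1L * Op0L;                               // V_MUL_LO_U32
 //   uint32_t Carry = (uint32_t)(((uint64_t)Op1L * Op0L) >> 32); // V_MUL_HI_U32
 //   uint32_t Hi    = Op1L * Op0H + Op1H * Op0L + Carry;         // V_MUL_LO + V_ADD
 //   uint64_t Result = ((uint64_t)Hi << 32) | Lo;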
7835 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7836 MachineInstr *Op1L_Op0H =
7837 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
7838 .add(Op1L)
7839 .add(Op0H);
7840
7841 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7842 MachineInstr *Op1H_Op0L =
7843 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
7844 .add(Op1H)
7845 .add(Op0L);
7846
7847 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7848 MachineInstr *Carry =
7849 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
7850 .add(Op1L)
7851 .add(Op0L);
7852
7853 MachineInstr *LoHalf =
7854 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7855 .add(Op1L)
7856 .add(Op0L);
7857
7858 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7859 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
7860 .addReg(Op1L_Op0H_Reg)
7861 .addReg(Op1H_Op0L_Reg);
7862
7863 MachineInstr *HiHalf =
7864 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
7865 .addReg(AddReg)
7866 .addReg(CarryReg);
7867
7868 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7869 .addReg(DestSub0)
7870 .addImm(AMDGPU::sub0)
7871 .addReg(DestSub1)
7872 .addImm(AMDGPU::sub1);
7873
7874 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7875
7876 // Try to legalize the operands in case we need to swap the order to keep it
7877 // valid.
7878 legalizeOperands(*Op1L_Op0H, MDT);
7879 legalizeOperands(*Op1H_Op0L, MDT);
7880 legalizeOperands(*Carry, MDT);
7881 legalizeOperands(*LoHalf, MDT);
7882 legalizeOperands(*Add, MDT);
7883 legalizeOperands(*HiHalf, MDT);
7884
7885 // Move all users of this moved value.
7886 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7887}
7888
7889 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
7890// multiplications.
7891void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
7892 MachineInstr &Inst,
7893 MachineDominatorTree *MDT) const {
7894 MachineBasicBlock &MBB = *Inst.getParent();
7895 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7896 
7897 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7898 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7899 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7900
7901 MachineOperand &Dest = Inst.getOperand(0);
7902 MachineOperand &Src0 = Inst.getOperand(1);
7903 MachineOperand &Src1 = Inst.getOperand(2);
7904 const DebugLoc &DL = Inst.getDebugLoc();
7905 MachineBasicBlock::iterator MII = Inst;
7906
7907 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7908 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7909 const TargetRegisterClass *Src0SubRC =
7910 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7911 if (RI.isSGPRClass(Src0SubRC))
7912 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7913 const TargetRegisterClass *Src1SubRC =
7914 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7915 if (RI.isSGPRClass(Src1SubRC))
7916 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7917
7918 // First, we extract the low 32-bit and high 32-bit values from each of the
7919 // operands.
7920 MachineOperand Op0L =
7921 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7922 MachineOperand Op1L =
7923 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7924
7925 unsigned Opc = Inst.getOpcode();
7926 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
7927 ? AMDGPU::V_MUL_HI_U32_e64
7928 : AMDGPU::V_MUL_HI_I32_e64;
7929 MachineInstr *HiHalf =
7930 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
7931
7932 MachineInstr *LoHalf =
7933 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7934 .add(Op1L)
7935 .add(Op0L);
7936
7937 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7938 .addReg(DestSub0)
7939 .addImm(AMDGPU::sub0)
7940 .addReg(DestSub1)
7941 .addImm(AMDGPU::sub1);
7942
7943 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7944
7945 // Try to legalize the operands in case we need to swap the order to keep it
7946 // valid.
7947 legalizeOperands(*HiHalf, MDT);
7948 legalizeOperands(*LoHalf, MDT);
7949
7950 // Move all users of this moved value.
7951 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7952}
7953
7954void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
7955 MachineInstr &Inst, unsigned Opcode,
7956 MachineDominatorTree *MDT) const {
7957 MachineBasicBlock &MBB = *Inst.getParent();
7958 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7959 
7960 MachineOperand &Dest = Inst.getOperand(0);
7961 MachineOperand &Src0 = Inst.getOperand(1);
7962 MachineOperand &Src1 = Inst.getOperand(2);
7963 DebugLoc DL = Inst.getDebugLoc();
7964
7965 MachineBasicBlock::iterator MII = Inst;
7966
7967 const MCInstrDesc &InstDesc = get(Opcode);
7968 const TargetRegisterClass *Src0RC = Src0.isReg() ?
7969 MRI.getRegClass(Src0.getReg()) :
7970 &AMDGPU::SGPR_32RegClass;
7971
7972 const TargetRegisterClass *Src0SubRC =
7973 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7974 const TargetRegisterClass *Src1RC = Src1.isReg() ?
7975 MRI.getRegClass(Src1.getReg()) :
7976 &AMDGPU::SGPR_32RegClass;
7977
7978 const TargetRegisterClass *Src1SubRC =
7979 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7980
7981 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7982 AMDGPU::sub0, Src0SubRC);
7983 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
7984 AMDGPU::sub0, Src1SubRC);
7985 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7986 AMDGPU::sub1, Src0SubRC);
7987 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
7988 AMDGPU::sub1, Src1SubRC);
7989
7990 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
7991 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
7992 const TargetRegisterClass *NewDestSubRC =
7993 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
7994
7995 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
7996 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
7997 .add(SrcReg0Sub0)
7998 .add(SrcReg1Sub0);
7999
8000 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8001 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8002 .add(SrcReg0Sub1)
8003 .add(SrcReg1Sub1);
8004
8005 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8006 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8007 .addReg(DestSub0)
8008 .addImm(AMDGPU::sub0)
8009 .addReg(DestSub1)
8010 .addImm(AMDGPU::sub1);
8011
8012 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8013
8014 Worklist.insert(&LoHalf);
8015 Worklist.insert(&HiHalf);
8016
8017 // Move all users of this moved value.
8018 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8019}
8020
8021void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8022 MachineInstr &Inst,
8023 MachineDominatorTree *MDT) const {
8024 MachineBasicBlock &MBB = *Inst.getParent();
8025 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8026 
8027 MachineOperand &Dest = Inst.getOperand(0);
8028 MachineOperand &Src0 = Inst.getOperand(1);
8029 MachineOperand &Src1 = Inst.getOperand(2);
8030 const DebugLoc &DL = Inst.getDebugLoc();
8031
8032 MachineBasicBlock::iterator MII = Inst;
8033
8034 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8035
8036 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8037
8038 MachineOperand* Op0;
8039 MachineOperand* Op1;
8040
8041 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8042 Op0 = &Src0;
8043 Op1 = &Src1;
8044 } else {
8045 Op0 = &Src1;
8046 Op1 = &Src0;
8047 }
8048
8049 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8050 .add(*Op0);
8051
8052 Register NewDest = MRI.createVirtualRegister(DestRC);
8053
8054 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8055 .addReg(Interm)
8056 .add(*Op1);
8057
8058 MRI.replaceRegWith(Dest.getReg(), NewDest);
8059
8060 Worklist.insert(&Xor);
8061}
8062
8063void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8064 MachineInstr &Inst) const {
8065 MachineBasicBlock &MBB = *Inst.getParent();
8066 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8067 
8068 MachineBasicBlock::iterator MII = Inst;
8069 const DebugLoc &DL = Inst.getDebugLoc();
8070
8071 MachineOperand &Dest = Inst.getOperand(0);
8072 MachineOperand &Src = Inst.getOperand(1);
8073
8074 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8075 const TargetRegisterClass *SrcRC = Src.isReg() ?
8076 MRI.getRegClass(Src.getReg()) :
8077 &AMDGPU::SGPR_32RegClass;
8078
8079 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8080 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8081
8082 const TargetRegisterClass *SrcSubRC =
8083 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8084
8085 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8086 AMDGPU::sub0, SrcSubRC);
8087 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8088 AMDGPU::sub1, SrcSubRC);
8089
8090 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8091
8092 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8093
8094 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8095
8096 // We don't need to legalize operands here. src0 for either instruction can be
8097 // an SGPR, and the second input is unused or determined here.
8098 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8099}
8100
8101void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8102 MachineInstr &Inst) const {
8103 MachineBasicBlock &MBB = *Inst.getParent();
8104 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8105 MachineBasicBlock::iterator MII = Inst;
8106 const DebugLoc &DL = Inst.getDebugLoc();
8107
8108 MachineOperand &Dest = Inst.getOperand(0);
8109 uint32_t Imm = Inst.getOperand(2).getImm();
8110 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8111 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8112
8113 (void) Offset;
8114
8115 // Only sext_inreg cases handled.
8116 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8117 Offset == 0 && "Not implemented");
8118
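 // For example, when BitWidth == 8 the path below sign-extends lo[7:0] into
 // the low half via V_BFE_I32 and fills the high half with copies of the sign
 // bit via an arithmetic shift right by 31.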
8119 if (BitWidth < 32) {
8120 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8121 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8122 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8123
8124 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8125 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8126 .addImm(0)
8127 .addImm(BitWidth);
8128
8129 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8130 .addImm(31)
8131 .addReg(MidRegLo);
8132
8133 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8134 .addReg(MidRegLo)
8135 .addImm(AMDGPU::sub0)
8136 .addReg(MidRegHi)
8137 .addImm(AMDGPU::sub1);
8138
8139 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8140 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8141 return;
8142 }
8143
8144 MachineOperand &Src = Inst.getOperand(1);
8145 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8146 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8147
8148 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8149 .addImm(31)
8150 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8151
8152 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8153 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8154 .addImm(AMDGPU::sub0)
8155 .addReg(TmpReg)
8156 .addImm(AMDGPU::sub1);
8157
8158 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8159 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8160}
8161
8162void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8163 MachineInstr &Inst, unsigned Opcode,
8164 MachineDominatorTree *MDT) const {
8165 // (S_FLBIT_I32_B64 hi:lo) ->
8166 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8167 // (S_FF1_I32_B64 hi:lo) ->
8168 // -> (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
8169
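 // Roughly, for the ctlz case (S_FLBIT_I32_B64) the sequence below computes
 //   result = umin(ffbh(hi), uaddsat(ffbh(lo), 32))
 // relying on V_FFBH_U32 returning ~0u for a zero input so the correct half is
 // always chosen; the cttz case is symmetric with the halves swapped.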
8170 MachineBasicBlock &MBB = *Inst.getParent();
8171 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8172 MachineBasicBlock::iterator MII = Inst;
8173 const DebugLoc &DL = Inst.getDebugLoc();
8174
8175 MachineOperand &Dest = Inst.getOperand(0);
8176 MachineOperand &Src = Inst.getOperand(1);
8177
8178 const MCInstrDesc &InstDesc = get(Opcode);
8179
8180 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8181 unsigned OpcodeAdd =
8182 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8183
8184 const TargetRegisterClass *SrcRC =
8185 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8186 const TargetRegisterClass *SrcSubRC =
8187 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8188
8189 MachineOperand SrcRegSub0 =
8190 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8191 MachineOperand SrcRegSub1 =
8192 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8193
8194 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8195 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8196 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8197 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8198
8199 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8200
8201 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8202
8203 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8204 .addReg(IsCtlz ? MidReg1 : MidReg2)
8205 .addImm(32)
8206 .addImm(1); // enable clamp
8207
8208 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8209 .addReg(MidReg3)
8210 .addReg(IsCtlz ? MidReg2 : MidReg1);
8211
8212 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8213
8214 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8215}
8216
8217void SIInstrInfo::addUsersToMoveToVALUWorklist(
8218 Register DstReg, MachineRegisterInfo &MRI,
8219 SIInstrWorklist &Worklist) const {
8220 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
8221 E = MRI.use_end(); I != E;) {
8222 MachineInstr &UseMI = *I->getParent();
8223
8224 unsigned OpNo = 0;
8225
8226 switch (UseMI.getOpcode()) {
8227 case AMDGPU::COPY:
8228 case AMDGPU::WQM:
8229 case AMDGPU::SOFT_WQM:
8230 case AMDGPU::STRICT_WWM:
8231 case AMDGPU::STRICT_WQM:
8232 case AMDGPU::REG_SEQUENCE:
8233 case AMDGPU::PHI:
8234 case AMDGPU::INSERT_SUBREG:
8235 break;
8236 default:
8237 OpNo = I.getOperandNo();
8238 break;
8239 }
8240
8241 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
8242 Worklist.insert(&UseMI);
8243
8244 do {
8245 ++I;
8246 } while (I != E && I->getParent() == &UseMI);
8247 } else {
8248 ++I;
8249 }
8250 }
8251}
8252
8253void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
8254 MachineRegisterInfo &MRI,
8255 MachineInstr &Inst) const {
8256 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8257 MachineBasicBlock *MBB = Inst.getParent();
8258 MachineOperand &Src0 = Inst.getOperand(1);
8259 MachineOperand &Src1 = Inst.getOperand(2);
8260 const DebugLoc &DL = Inst.getDebugLoc();
8261
8262 switch (Inst.getOpcode()) {
8263 case AMDGPU::S_PACK_LL_B32_B16: {
8264 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8265 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8266
8267 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8268 // 0.
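 // In effect the three instructions below compute
 //   Result = (Src1 << 16) | (Src0 & 0xffff),
 // packing the low 16 bits of each source into one 32-bit register.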
8269 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8270 .addImm(0xffff);
8271
8272 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
8273 .addReg(ImmReg, RegState::Kill)
8274 .add(Src0);
8275
8276 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8277 .add(Src1)
8278 .addImm(16)
8279 .addReg(TmpReg, RegState::Kill);
8280 break;
8281 }
8282 case AMDGPU::S_PACK_LH_B32_B16: {
8283 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8284 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8285 .addImm(0xffff);
8286 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
8287 .addReg(ImmReg, RegState::Kill)
8288 .add(Src0)
8289 .add(Src1);
8290 break;
8291 }
8292 case AMDGPU::S_PACK_HL_B32_B16: {
8293 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8294 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8295 .addImm(16)
8296 .add(Src0);
8297 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8298 .add(Src1)
8299 .addImm(16)
8300 .addReg(TmpReg, RegState::Kill);
8301 break;
8302 }
8303 case AMDGPU::S_PACK_HH_B32_B16: {
8304 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8305 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8306 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8307 .addImm(16)
8308 .add(Src0);
8309 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8310 .addImm(0xffff0000);
8311 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
8312 .add(Src1)
8313 .addReg(ImmReg, RegState::Kill)
8314 .addReg(TmpReg, RegState::Kill);
8315 break;
8316 }
8317 default:
8318 llvm_unreachable("unhandled s_pack_* instruction");
8319 }
8320
8321 MachineOperand &Dest = Inst.getOperand(0);
8322 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8323 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8324}
8325
8326void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
8327 MachineInstr &SCCDefInst,
8328 SIInstrWorklist &Worklist,
8329 Register NewCond) const {
8330
8331 // Ensure that def inst defines SCC, which is still live.
8332 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
8333 !Op.isDead() && Op.getParent() == &SCCDefInst);
8334 SmallVector<MachineInstr *, 4> CopyToDelete;
8335 // This assumes that all the users of SCC are in the same block
8336 // as the SCC def.
8337 for (MachineInstr &MI : // Skip the def inst itself.
8338 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
8339 SCCDefInst.getParent()->end())) {
8340 // Check if SCC is used first.
8341 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI);
8342 if (SCCIdx != -1) {
8343 if (MI.isCopy()) {
8344 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8345 Register DestReg = MI.getOperand(0).getReg();
8346
8347 MRI.replaceRegWith(DestReg, NewCond);
8348 CopyToDelete.push_back(&MI);
8349 } else {
8350
8351 if (NewCond.isValid())
8352 MI.getOperand(SCCIdx).setReg(NewCond);
8353
8354 Worklist.insert(&MI);
8355 }
8356 }
8357 // Exit if we find another SCC def.
8358 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
8359 break;
8360 }
8361 for (auto &Copy : CopyToDelete)
8362 Copy->eraseFromParent();
8363}
8364
8365// Instructions that use SCC may be converted to VALU instructions. When that
8366// happens, the SCC register is changed to VCC_LO. The instruction that defines
8367// SCC must be changed to an instruction that defines VCC. This function makes
8368// sure that the instruction that defines SCC is added to the moveToVALU
8369// worklist.
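 // For example, if an s_cselect that read SCC has been rewritten to a
 // v_cndmask reading VCC, the s_cmp that produced SCC must itself become a
 // v_cmp writing VCC; this walk queues that defining instruction.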
8370void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
8371 SIInstrWorklist &Worklist) const {
8372 // Look for a preceding instruction that either defines VCC or SCC. If VCC
8373 // then there is nothing to do because the defining instruction has been
8374 // converted to a VALU already. If SCC then that instruction needs to be
8375 // converted to a VALU.
8376 for (MachineInstr &MI :
8377 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
8378 SCCUseInst->getParent()->rend())) {
8379 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
8380 break;
8381 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
8382 Worklist.insert(&MI);
8383 break;
8384 }
8385 }
8386}
8387
8388const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
8389 const MachineInstr &Inst) const {
8390 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
8391
8392 switch (Inst.getOpcode()) {
8393 // For target instructions, getOpRegClass just returns the virtual register
8394 // class associated with the operand, so we need to find an equivalent VGPR
8395 // register class in order to move the instruction to the VALU.
8396 case AMDGPU::COPY:
8397 case AMDGPU::PHI:
8398 case AMDGPU::REG_SEQUENCE:
8399 case AMDGPU::INSERT_SUBREG:
8400 case AMDGPU::WQM:
8401 case AMDGPU::SOFT_WQM:
8402 case AMDGPU::STRICT_WWM:
8403 case AMDGPU::STRICT_WQM: {
8404 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
8405 if (RI.isAGPRClass(SrcRC)) {
8406 if (RI.isAGPRClass(NewDstRC))
8407 return nullptr;
8408
8409 switch (Inst.getOpcode()) {
8410 case AMDGPU::PHI:
8411 case AMDGPU::REG_SEQUENCE:
8412 case AMDGPU::INSERT_SUBREG:
8413 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
8414 break;
8415 default:
8416 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8417 }
8418
8419 if (!NewDstRC)
8420 return nullptr;
8421 } else {
8422 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
8423 return nullptr;
8424
8425 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8426 if (!NewDstRC)
8427 return nullptr;
8428 }
8429
8430 return NewDstRC;
8431 }
8432 default:
8433 return NewDstRC;
8434 }
8435}
8436
8437// Find the one SGPR operand we are allowed to use.
8438Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
8439 int OpIndices[3]) const {
8440 const MCInstrDesc &Desc = MI.getDesc();
8441
8442 // Find the one SGPR operand we are allowed to use.
8443 //
8444 // First we need to consider the instruction's operand requirements before
8445 // legalizing. Some operands are required to be SGPRs, such as implicit uses
8446 // of VCC, but we are still bound by the constant bus requirement to only use
8447 // one.
8448 //
8449 // If the operand's class is an SGPR, we can never move it.
8450
8451 Register SGPRReg = findImplicitSGPRRead(MI);
8452 if (SGPRReg)
8453 return SGPRReg;
8454
8455 Register UsedSGPRs[3] = {Register()};
8456 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8457
8458 for (unsigned i = 0; i < 3; ++i) {
8459 int Idx = OpIndices[i];
8460 if (Idx == -1)
8461 break;
8462
8463 const MachineOperand &MO = MI.getOperand(Idx);
8464 if (!MO.isReg())
8465 continue;
8466
8467 // Is this operand statically required to be an SGPR based on the operand
8468 // constraints?
8469 const TargetRegisterClass *OpRC =
8470 RI.getRegClass(Desc.operands()[Idx].RegClass);
8471 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
8472 if (IsRequiredSGPR)
8473 return MO.getReg();
8474
8475 // If this could be a VGPR or an SGPR, check the dynamic register class.
8476 Register Reg = MO.getReg();
8477 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
8478 if (RI.isSGPRClass(RegRC))
8479 UsedSGPRs[i] = Reg;
8480 }
8481
8482 // We don't have a required SGPR operand, so we have a bit more freedom in
8483 // selecting operands to move.
8484
8485 // Try to select the most used SGPR. If an SGPR is equal to one of the
8486 // others, we choose that.
8487 //
8488 // e.g.
8489 // V_FMA_F32 v0, s0, s0, s0 -> No moves
8490 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
8491
8492 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
8493 // prefer those.
8494
8495 if (UsedSGPRs[0]) {
8496 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
8497 SGPRReg = UsedSGPRs[0];
8498 }
8499
8500 if (!SGPRReg && UsedSGPRs[1]) {
8501 if (UsedSGPRs[1] == UsedSGPRs[2])
8502 SGPRReg = UsedSGPRs[1];
8503 }
8504
8505 return SGPRReg;
8506}
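// Editor's walk-through (illustrative, not from the original source): for
//   V_FMA_F32 v0, s0, s1, s0
// the loop above records UsedSGPRs = { s0, s1, s0 }. Because UsedSGPRs[0] ==
// UsedSGPRs[2], s0 is returned as the single SGPR that may stay on the
// constant bus, and the legalizer then moves s1 into a VGPR.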
8507
 8508 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
 8509 unsigned OperandName) const {
8510 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
8511 if (Idx == -1)
8512 return nullptr;
8513
8514 return &MI.getOperand(Idx);
8515}
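// Editor's sketch (assumed usage pattern, not from this file): callers probe
// optional operands by name and handle the nullptr case for encodings that
// lack them, e.g.
//   if (MachineOperand *SOff = TII->getNamedOperand(MI, AMDGPU::OpName::soffset))
//     ... inspect SOff->isReg() / SOff->isImm() before using it ...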
8516
8522 return (Format << 44) |
8523 (1ULL << 56) | // RESOURCE_LEVEL = 1
8524 (3ULL << 60); // OOB_SELECT = 3
8525 }
8526
8527 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
8528 if (ST.isAmdHsaOS()) {
8529 // Set ATC = 1. GFX9 doesn't have this bit.
8531 RsrcDataFormat |= (1ULL << 56);
8532
8533 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
8534 // BTW, it disables TC L2 and therefore decreases performance.
8536 RsrcDataFormat |= (2ULL << 59);
8537 }
8538
8539 return RsrcDataFormat;
8540}
8541
 8542 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
 8543 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
 8544 AMDGPU::RSRC_TID_ENABLE |
 8545 0xffffffff; // Size;
8546
8547 // GFX9 doesn't have ELEMENT_SIZE.
8549 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
8550 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
8551 }
8552
 8553 // IndexStride = 64 or 32, depending on the wavefront size.
8554 uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
8555 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
8556
8557 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
8558 // Clear them unless we want a huge stride.
8561 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
8562
8563 return Rsrc23;
8564}
8565
 8566 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
 8567 unsigned Opc = MI.getOpcode();
8568
8569 return isSMRD(Opc);
8570}
8571
 8572 bool SIInstrInfo::isHighLatencyDef(int Opc) const {
 8573 return get(Opc).mayLoad() &&
8574 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
8575}
8576
 8577 Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
 8578 int &FrameIndex) const {
8579 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
8580 if (!Addr || !Addr->isFI())
8581 return Register();
8582
8583 assert(!MI.memoperands_empty() &&
8584 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
8585
8586 FrameIndex = Addr->getIndex();
8587 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
8588}
8589
 8590 Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
 8591 int &FrameIndex) const {
8592 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
8593 assert(Addr && Addr->isFI());
8594 FrameIndex = Addr->getIndex();
8595 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
8596}
8597
 8598 Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
 8599 int &FrameIndex) const {
8600 if (!MI.mayLoad())
8601 return Register();
8602
8603 if (isMUBUF(MI) || isVGPRSpill(MI))
8604 return isStackAccess(MI, FrameIndex);
8605
8606 if (isSGPRSpill(MI))
8607 return isSGPRStackAccess(MI, FrameIndex);
8608
8609 return Register();
8610}
8611
 8612 Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
 8613 int &FrameIndex) const {
8614 if (!MI.mayStore())
8615 return Register();
8616
8617 if (isMUBUF(MI) || isVGPRSpill(MI))
8618 return isStackAccess(MI, FrameIndex);
8619
8620 if (isSGPRSpill(MI))
8621 return isSGPRStackAccess(MI, FrameIndex);
8622
8623 return Register();
8624}
8625
 8626 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
 8627 unsigned Size = 0;
 8628 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
 8629 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
 8630 while (++I != E && I->isInsideBundle()) {
 8631 assert(!I->isBundle() && "No nested bundle!");
 8632 Size += getInstSizeInBytes(*I);
 8633 }
8634
8635 return Size;
8636}
8637
 8638 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
 8639 unsigned Opc = MI.getOpcode();
 8640 const MCInstrDesc &Desc = get(Opc);
 8641 unsigned DescSize = Desc.getSize();
8642
8643 // If we have a definitive size, we can use it. Otherwise we need to inspect
8644 // the operands to know the size.
8645 if (isFixedSize(MI)) {
8646 unsigned Size = DescSize;
8647
8648 // If we hit the buggy offset, an extra nop will be inserted in MC so
8649 // estimate the worst case.
8650 if (MI.isBranch() && ST.hasOffset3fBug())
8651 Size += 4;
8652
8653 return Size;
8654 }
8655
8656 // Instructions may have a 32-bit literal encoded after them. Check
8657 // operands that could ever be literals.
8658 if (isVALU(MI) || isSALU(MI)) {
8659 if (isDPP(MI))
8660 return DescSize;
8661 bool HasLiteral = false;
8662 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
8663 const MachineOperand &Op = MI.getOperand(I);
8664 const MCOperandInfo &OpInfo = Desc.operands()[I];
8665 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
8666 HasLiteral = true;
8667 break;
8668 }
8669 }
8670 return HasLiteral ? DescSize + 4 : DescSize;
8671 }
8672
8673 // Check whether we have extra NSA words.
8674 if (isMIMG(MI)) {
8675 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
8676 if (VAddr0Idx < 0)
8677 return 8;
8678
8679 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
8680 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
8681 }
8682
8683 switch (Opc) {
8684 case TargetOpcode::BUNDLE:
8685 return getInstBundleSize(MI);
8686 case TargetOpcode::INLINEASM:
8687 case TargetOpcode::INLINEASM_BR: {
8688 const MachineFunction *MF = MI.getParent()->getParent();
8689 const char *AsmStr = MI.getOperand(0).getSymbolName();
8690 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
8691 }
8692 default:
8693 if (MI.isMetaInstruction())
8694 return 0;
8695 return DescSize;
8696 }
8697}
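// Editor's examples (illustrative): an SALU move with a non-inline literal,
//   S_MOV_B32 s0, 0x12345
// is reported as DescSize (4) + 4 = 8 bytes, a VALU instruction whose operands
// are all registers or inline constants is reported as DescSize, and a DPP
// instruction is always reported as DescSize.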
8698
 8699 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
 8700 if (!isFLAT(MI))
8701 return false;
8702
8703 if (MI.memoperands_empty())
8704 return true;
8705
8706 for (const MachineMemOperand *MMO : MI.memoperands()) {
8707 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
8708 return true;
8709 }
8710 return false;
8711}
8712
 8713 bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
 8714 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
8715}
8716
8718 MachineBasicBlock *IfEnd) const {
8720 assert(TI != IfEntry->end());
8721
8722 MachineInstr *Branch = &(*TI);
8723 MachineFunction *MF = IfEntry->getParent();
8725
8726 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8727 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8728 MachineInstr *SIIF =
8729 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
8730 .add(Branch->getOperand(0))
8731 .add(Branch->getOperand(1));
8732 MachineInstr *SIEND =
8733 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
8734 .addReg(DstReg);
8735
8736 IfEntry->erase(TI);
8737 IfEntry->insert(IfEntry->end(), SIIF);
8738 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
8739 }
8740}
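// Editor's sketch of the rewrite above (MIR is illustrative):
//   bb.if_entry:
//     SI_NON_UNIFORM_BRCOND_PSEUDO %cond, %bb.if_end
// becomes
//   bb.if_entry:
//     %mask:sreg_64 = SI_IF %cond, %bb.if_end   ; sreg_32 on wave32
//   bb.if_end:
//     SI_END_CF %mask                           ; at the first non-PHI point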
8741
8743 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
8745 // We expect 2 terminators, one conditional and one unconditional.
8746 assert(TI != LoopEnd->end());
8747
8748 MachineInstr *Branch = &(*TI);
8749 MachineFunction *MF = LoopEnd->getParent();
8751
8752 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8753
8754 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8755 Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
8756 MachineInstrBuilder HeaderPHIBuilder =
8757 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
8758 for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) {
8759 if (PMBB == LoopEnd) {
8760 HeaderPHIBuilder.addReg(BackEdgeReg);
8761 } else {
8762 Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
8763 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
8764 ZeroReg, 0);
8765 HeaderPHIBuilder.addReg(ZeroReg);
8766 }
8767 HeaderPHIBuilder.addMBB(PMBB);
8768 }
8769 MachineInstr *HeaderPhi = HeaderPHIBuilder;
8770 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
8771 get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
8772 .addReg(DstReg)
8773 .add(Branch->getOperand(0));
8774 MachineInstr *SILOOP =
8775 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
8776 .addReg(BackEdgeReg)
8777 .addMBB(LoopEntry);
8778
8779 LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
8780 LoopEnd->erase(TI);
8781 LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
8782 LoopEnd->insert(LoopEnd->end(), SILOOP);
8783 }
8784}
8785
8788 static const std::pair<int, const char *> TargetIndices[] = {
8789 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
8790 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
8791 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
8792 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
8793 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
8794 return ArrayRef(TargetIndices);
8795}
8796
 8797/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
8798/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
8801 const ScheduleDAG *DAG) const {
8802 return new GCNHazardRecognizer(DAG->MF);
8803}
8804
8805/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
8806/// pass.
8809 return new GCNHazardRecognizer(MF);
8810}
8811
8812// Called during:
8813// - pre-RA scheduling and post-RA scheduling
8816 const ScheduleDAGMI *DAG) const {
8817 // Borrowed from Arm Target
8818 // We would like to restrict this hazard recognizer to only
8819 // post-RA scheduling; we can tell that we're post-RA because we don't
8820 // track VRegLiveness.
8821 if (!DAG->hasVRegLiveness())
8822 return new GCNHazardRecognizer(DAG->MF);
8824}
8825
8826std::pair<unsigned, unsigned>
8828 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
8829}
8830
8833 static const std::pair<unsigned, const char *> TargetFlags[] = {
8834 { MO_GOTPCREL, "amdgpu-gotprel" },
8835 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
8836 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
8837 { MO_REL32_LO, "amdgpu-rel32-lo" },
8838 { MO_REL32_HI, "amdgpu-rel32-hi" },
8839 { MO_ABS32_LO, "amdgpu-abs32-lo" },
8840 { MO_ABS32_HI, "amdgpu-abs32-hi" },
8841 };
8842
8843 return ArrayRef(TargetFlags);
8844}
8845
8848 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8849 {
8850 {MONoClobber, "amdgpu-noclobber"},
8851 {MOLastUse, "amdgpu-last-use"},
8852 };
8853
8854 return ArrayRef(TargetFlags);
8855}
8856
8858 const MachineFunction &MF) const {
8860 assert(SrcReg.isVirtual());
8861 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
8862 return AMDGPU::WWM_COPY;
8863
8864 return AMDGPU::COPY;
8865}
8866
 8867 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
 8868 Register Reg) const {
8869 // We need to handle instructions which may be inserted during register
8870 // allocation to handle the prolog. The initial prolog instruction may have
8871 // been separated from the start of the block by spills and copies inserted
 8872 // for the prolog. However, the insertions for scalar registers can
8873 // always be placed at the BB top as they are independent of the exec mask
8874 // value.
8875 bool IsNullOrVectorRegister = true;
8876 if (Reg) {
8877 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8878 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
8879 }
8880
8881 uint16_t Opcode = MI.getOpcode();
8882 // FIXME: Copies inserted in the block prolog for live-range split should also
8883 // be included.
8884 return IsNullOrVectorRegister &&
8885 (isSpill(Opcode) || (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
8886 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
8887}
8888
8892 const DebugLoc &DL,
8893 Register DestReg) const {
8894 if (ST.hasAddNoCarry())
8895 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
8896
8898 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
8899 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
8900
8901 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8902 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8903}
8904
8907 const DebugLoc &DL,
8908 Register DestReg,
8909 RegScavenger &RS) const {
8910 if (ST.hasAddNoCarry())
8911 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
8912
8913 // If available, prefer to use vcc.
8914 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
8915 ? Register(RI.getVCC())
8917 *RI.getBoolRC(), I, /* RestoreAfter */ false,
8918 0, /* AllowSpill */ false);
8919
8920 // TODO: Users need to deal with this.
8921 if (!UnusedCarry.isValid())
8922 return MachineInstrBuilder();
8923
8924 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8925 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8926}
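// Editor's sketch (assumed usage, register/offset names are hypothetical): the
// returned builder already carries the destination (and, when needed, a dead
// carry-out), so callers just append the source operands:
//   MachineInstrBuilder Add = TII->getAddNoCarry(MBB, I, DL, DestReg, RS);
//   if (Add) // may be an empty builder if no carry register could be scavenged
//     Add.addImm(Offset).addReg(BaseReg); // the VOP3 form also takes a clamp bit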
8927
8928bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
8929 switch (Opcode) {
8930 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
8931 case AMDGPU::SI_KILL_I1_TERMINATOR:
8932 return true;
8933 default:
8934 return false;
8935 }
8936}
8937
 8938 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
 8939 switch (Opcode) {
8940 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
8941 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
8942 case AMDGPU::SI_KILL_I1_PSEUDO:
8943 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
8944 default:
8945 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
8946 }
8947}
8948
8949bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
8950 return Imm <= getMaxMUBUFImmOffset(ST);
8951}
8952
 8953 unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
 8954 // The GFX12 field is a 24-bit signed byte offset; only its non-negative (23-bit) range is usable here.
8955 const unsigned OffsetBits =
8956 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
8957 return (1 << OffsetBits) - 1;
8958}
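// Editor's sketch (not part of the original file): the resulting limits for the
// two field widths used above.
static_assert((1u << 12) - 1 == 4095u, "pre-GFX12 max MUBUF immediate offset");
static_assert((1u << 23) - 1 == 8388607u, "GFX12 max MUBUF immediate offset");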
8959
 8960 void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
 8961 if (!ST.isWave32())
8962 return;
8963
8964 if (MI.isInlineAsm())
8965 return;
8966
8967 for (auto &Op : MI.implicit_operands()) {
8968 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
8969 Op.setReg(AMDGPU::VCC_LO);
8970 }
8971}
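// Editor's example (illustrative): on a wave32 subtarget an implicit VCC
// operand added by the instruction definition, e.g.
//   V_CNDMASK_B32_e32 %dst, %src0, %src1, implicit $vcc
// is rewritten to use the 32-bit low half:
//   V_CNDMASK_B32_e32 %dst, %src0, %src1, implicit $vcc_lo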
8972
 8973 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
 8974 if (!isSMRD(MI))
8975 return false;
8976
8977 // Check that it is using a buffer resource.
8978 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
8979 if (Idx == -1) // e.g. s_memtime
8980 return false;
8981
8982 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
8983 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
8984}
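// Editor's note (illustrative): S_BUFFER_LOAD_* opcodes carry a 128-bit sbase
// resource and are therefore buffer SMRDs here, while S_LOAD_* uses a 64-bit
// sbase pointer and S_MEMTIME has no sbase operand at all, so both return false.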
8985
8986// Given Imm, split it into the values to put into the SOffset and ImmOffset
8987// fields in an MUBUF instruction. Return false if it is not possible (due to a
8988// hardware bug needing a workaround).
8989//
8990// The required alignment ensures that individual address components remain
8991// aligned if they are aligned to begin with. It also ensures that additional
8992// offsets within the given alignment can be added to the resulting ImmOffset.
 8993 bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
 8994 uint32_t &ImmOffset, Align Alignment) const {
8995 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
8996 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
8997 uint32_t Overflow = 0;
8998
8999 if (Imm > MaxImm) {
9000 if (Imm <= MaxImm + 64) {
9001 // Use an SOffset inline constant for 4..64
9002 Overflow = Imm - MaxImm;
9003 Imm = MaxImm;
9004 } else {
9005 // Try to keep the same value in SOffset for adjacent loads, so that
9006 // the corresponding register contents can be re-used.
9007 //
9008 // Load values with all low-bits (except for alignment bits) set into
9009 // SOffset, so that a larger range of values can be covered using
9010 // s_movk_i32.
9011 //
9012 // Atomic operations fail to work correctly when individual address
9013 // components are unaligned, even if their sum is aligned.
9014 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9015 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9016 Imm = Low;
9017 Overflow = High - Alignment.value();
9018 }
9019 }
9020
9021 if (Overflow > 0) {
9022 // There is a hardware bug in SI and CI which prevents address clamping in
9023 // MUBUF instructions from working correctly with SOffsets. The immediate
9024 // offset is unaffected.
 9025 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
 9026 return false;
9027
 9028 // On some targets it is not possible to set an immediate in the SOffset field.
9029 if (ST.hasRestrictedSOffset())
9030 return false;
9031 }
9032
9033 ImmOffset = Imm;
9034 SOffset = Overflow;
9035 return true;
9036}
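// Editor's worked example (illustrative; assumes MaxOffset == 4095 and a 4-byte
// alignment): Imm = 5000 exceeds MaxImm + 64, so the else-branch above yields
// ImmOffset = 908 and SOffset = 4092, which still sum to the original offset.
static_assert(((5000u + 4u) & ~4095u) == 4096u, "High");
static_assert(((5000u + 4u) & 4095u) == 908u, "Low, i.e. the new ImmOffset");
static_assert((4096u - 4u) + 908u == 5000u, "SOffset + ImmOffset == Imm");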
9037
9038// Depending on the used address space and instructions, some immediate offsets
9039// are allowed and some are not.
9040// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9041// scratch instruction offsets can also be negative. On GFX12, offsets can be
9042// negative for all variants.
9043//
9044// There are several bugs related to these offsets:
9045// On gfx10.1, flat instructions that go into the global address space cannot
9046// use an offset.
9047//
9048// For scratch instructions, the address can be either an SGPR or a VGPR.
9049// The following offsets can be used, depending on the architecture (x means
9050// cannot be used):
9051// +----------------------------+------+------+
9052// | Address-Mode | SGPR | VGPR |
9053// +----------------------------+------+------+
9054// | gfx9 | | |
9055// | negative, 4-aligned offset | x | ok |
9056// | negative, unaligned offset | x | ok |
9057// +----------------------------+------+------+
9058// | gfx10 | | |
9059// | negative, 4-aligned offset | ok | ok |
9060// | negative, unaligned offset | ok | x |
9061// +----------------------------+------+------+
9062// | gfx10.3 | | |
9063// | negative, 4-aligned offset | ok | ok |
9064// | negative, unaligned offset | ok | ok |
9065// +----------------------------+------+------+
9066//
9067// This function ignores the addressing mode, so if an offset cannot be used in
9068// one addressing mode, it is considered illegal.
9069bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9070 uint64_t FlatVariant) const {
9071 // TODO: Should 0 be special cased?
9072 if (!ST.hasFlatInstOffsets())
9073 return false;
9074
9075 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9076 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9077 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9078 return false;
9079
 9080 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
 9081 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9082 (Offset % 4) != 0) {
9083 return false;
9084 }
9085
9086 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9087 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9088 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9089}
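// Editor's example (field width is illustrative; the real value comes from
// AMDGPU::getNumFlatOffsetBits(ST)): with a 12-bit signed field, isIntN(12,
// Offset) accepts -2048..2047, and the AllowNegative check then rejects the
// negative half for variants whose offset is treated as unsigned.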
9090
9091// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
9092std::pair<int64_t, int64_t>
9093SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9094 uint64_t FlatVariant) const {
9095 int64_t RemainderOffset = COffsetVal;
9096 int64_t ImmField = 0;
9097
9098 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9099 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9100
9101 if (AllowNegative) {
9102 // Use signed division by a power of two to truncate towards 0.
9103 int64_t D = 1LL << NumBits;
9104 RemainderOffset = (COffsetVal / D) * D;
9105 ImmField = COffsetVal - RemainderOffset;
9106
 9107 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
 9108 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9109 (ImmField % 4) != 0) {
9110 // Make ImmField a multiple of 4
9111 RemainderOffset += ImmField % 4;
9112 ImmField -= ImmField % 4;
9113 }
9114 } else if (COffsetVal >= 0) {
9115 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9116 RemainderOffset = COffsetVal - ImmField;
9117 }
9118
9119 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9120 assert(RemainderOffset + ImmField == COffsetVal);
9121 return {ImmField, RemainderOffset};
9122}
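// Editor's worked example (illustrative; assumes a 13-bit signed immediate, so
// NumBits == 12 and the divisor D == 4096): truncating division keeps ImmField
// within the immediate range for both signs.
static_assert((9000 / 4096) * 4096 == 8192, "positive RemainderOffset");
static_assert(9000 - 8192 == 808, "positive ImmField");
static_assert((-5000 / 4096) * 4096 == -4096, "negative RemainderOffset");
static_assert(-5000 - (-4096) == -904, "negative ImmField");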
9123
 9124 bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
 9125 if (ST.hasNegativeScratchOffsetBug() &&
9126 FlatVariant == SIInstrFlags::FlatScratch)
9127 return false;
9128
9129 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9130}
9131
9132static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9133 switch (ST.getGeneration()) {
9134 default:
9135 break;
9138 return SIEncodingFamily::SI;
9141 return SIEncodingFamily::VI;
9148 }
9149 llvm_unreachable("Unknown subtarget generation!");
9150}
9151
9152bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9153 switch(MCOp) {
9154 // These opcodes use indirect register addressing so
9155 // they need special handling by codegen (currently missing).
9156 // Therefore it is too risky to allow these opcodes
9157 // to be selected by dpp combiner or sdwa peepholer.
9158 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9159 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9160 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9161 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9162 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9163 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9164 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9165 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9166 return true;
9167 default:
9168 return false;
9169 }
9170}
9171
9172int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9173 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9174
9175 unsigned Gen = subtargetEncodingFamily(ST);
9176
9177 if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
9180
9181 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9182 // subtarget has UnpackedD16VMem feature.
9183 // TODO: remove this when we discard GFX80 encoding.
9184 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9186
9187 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9188 switch (ST.getGeneration()) {
9189 default:
9191 break;
9194 break;
9197 break;
9198 }
9199 }
9200
9201 if (isMAI(Opcode)) {
9202 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9203 if (MFMAOp != -1)
9204 Opcode = MFMAOp;
9205 }
9206
9207 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9208
9209 // -1 means that Opcode is already a native instruction.
9210 if (MCOp == -1)
9211 return Opcode;
9212
9213 if (ST.hasGFX90AInsts()) {
9214 uint16_t NMCOp = (uint16_t)-1;
9215 if (ST.hasGFX940Insts())
9217 if (NMCOp == (uint16_t)-1)
9219 if (NMCOp == (uint16_t)-1)
9221 if (NMCOp != (uint16_t)-1)
9222 MCOp = NMCOp;
9223 }
9224
9225 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9226 // no encoding in the given subtarget generation.
9227 if (MCOp == (uint16_t)-1)
9228 return -1;
9229
9230 if (isAsmOnlyOpcode(MCOp))
9231 return -1;
9232
9233 return MCOp;
9234}
9235
9236static
9238 assert(RegOpnd.isReg());
9239 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9240 getRegSubRegPair(RegOpnd);
9241}
9242
9245 assert(MI.isRegSequence());
9246 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9247 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9248 auto &RegOp = MI.getOperand(1 + 2 * I);
9249 return getRegOrUndef(RegOp);
9250 }
9252}
9253
9254// Try to find the definition of reg:subreg in subreg-manipulation pseudos
9255// Following a subreg of reg:subreg isn't supported
9258 if (!RSR.SubReg)
9259 return false;
9260 switch (MI.getOpcode()) {
9261 default: break;
9262 case AMDGPU::REG_SEQUENCE:
9263 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
9264 return true;
 9265 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
9266 case AMDGPU::INSERT_SUBREG:
9267 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
9268 // inserted the subreg we're looking for
9269 RSR = getRegOrUndef(MI.getOperand(2));
9270 else { // the subreg in the rest of the reg
9271 auto R1 = getRegOrUndef(MI.getOperand(1));
9272 if (R1.SubReg) // subreg of subreg isn't supported
9273 return false;
9274 RSR.Reg = R1.Reg;
9275 }
9276 return true;
9277 }
9278 return false;
9279}
9280
9283 assert(MRI.isSSA());
9284 if (!P.Reg.isVirtual())
9285 return nullptr;
9286
9287 auto RSR = P;
9288 auto *DefInst = MRI.getVRegDef(RSR.Reg);
9289 while (auto *MI = DefInst) {
9290 DefInst = nullptr;
9291 switch (MI->getOpcode()) {
9292 case AMDGPU::COPY:
9293 case AMDGPU::V_MOV_B32_e32: {
9294 auto &Op1 = MI->getOperand(1);
9295 if (Op1.isReg() && Op1.getReg().isVirtual()) {
9296 if (Op1.isUndef())
9297 return nullptr;
9298 RSR = getRegSubRegPair(Op1);
9299 DefInst = MRI.getVRegDef(RSR.Reg);
9300 }
9301 break;
9302 }
9303 default:
9304 if (followSubRegDef(*MI, RSR)) {
9305 if (!RSR.Reg)
9306 return nullptr;
9307 DefInst = MRI.getVRegDef(RSR.Reg);
9308 }
9309 }
9310 if (!DefInst)
9311 return MI;
9312 }
9313 return nullptr;
9314}
9315
9317 Register VReg,
9318 const MachineInstr &DefMI,
9319 const MachineInstr &UseMI) {
9320 assert(MRI.isSSA() && "Must be run on SSA");
9321
9322 auto *TRI = MRI.getTargetRegisterInfo();
9323 auto *DefBB = DefMI.getParent();
9324
9325 // Don't bother searching between blocks, although it is possible this block
9326 // doesn't modify exec.
9327 if (UseMI.getParent() != DefBB)
9328 return true;
9329
9330 const int MaxInstScan = 20;
9331 int NumInst = 0;
9332
9333 // Stop scan at the use.
9334 auto E = UseMI.getIterator();
9335 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
9336 if (I->isDebugInstr())
9337 continue;
9338
9339 if (++NumInst > MaxInstScan)
9340 return true;
9341
9342 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9343 return true;
9344 }
9345
9346 return false;
9347}
9348
9350 Register VReg,
9351 const MachineInstr &DefMI) {
9352 assert(MRI.isSSA() && "Must be run on SSA");
9353
9354 auto *TRI = MRI.getTargetRegisterInfo();
9355 auto *DefBB = DefMI.getParent();
9356
9357 const int MaxUseScan = 10;
9358 int NumUse = 0;
9359
9360 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
9361 auto &UseInst = *Use.getParent();
9362 // Don't bother searching between blocks, although it is possible this block
9363 // doesn't modify exec.
9364 if (UseInst.getParent() != DefBB || UseInst.isPHI())
9365 return true;
9366
9367 if (++NumUse > MaxUseScan)
9368 return true;
9369 }
9370
9371 if (NumUse == 0)
9372 return false;
9373
9374 const int MaxInstScan = 20;
9375 int NumInst = 0;
9376
9377 // Stop scan when we have seen all the uses.
9378 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
9379 assert(I != DefBB->end());
9380
9381 if (I->isDebugInstr())
9382 continue;
9383
9384 if (++NumInst > MaxInstScan)
9385 return true;
9386
9387 for (const MachineOperand &Op : I->operands()) {
9388 // We don't check reg masks here as they're used only on calls:
9389 // 1. EXEC is only considered const within one BB
9390 // 2. Call should be a terminator instruction if present in a BB
9391
9392 if (!Op.isReg())
9393 continue;
9394
9395 Register Reg = Op.getReg();
9396 if (Op.isUse()) {
9397 if (Reg == VReg && --NumUse == 0)
9398 return false;
9399 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
9400 return true;
9401 }
9402 }
9403}
9404
 9405 MachineInstr *SIInstrInfo::createPHIDestinationCopy(
 9406 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
 9407 const DebugLoc &DL, Register Src, Register Dst) const {
9408 auto Cur = MBB.begin();
9409 if (Cur != MBB.end())
9410 do {
9411 if (!Cur->isPHI() && Cur->readsRegister(Dst))
9412 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
9413 ++Cur;
9414 } while (Cur != MBB.end() && Cur != LastPHIIt);
9415
9416 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
9417 Dst);
9418}
9419
 9420 MachineInstr *SIInstrInfo::createPHISourceCopy(
 9421 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
 9422 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
9423 if (InsPt != MBB.end() &&
9424 (InsPt->getOpcode() == AMDGPU::SI_IF ||
9425 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9426 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9427 InsPt->definesRegister(Src)) {
9428 InsPt++;
9429 return BuildMI(MBB, InsPt, DL,
9430 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
9431 : AMDGPU::S_MOV_B64_term),
9432 Dst)
9433 .addReg(Src, 0, SrcSubReg)
9434 .addReg(AMDGPU::EXEC, RegState::Implicit);
9435 }
9436 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
9437 Dst);
9438}
9439
9440bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
9441
9444 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
9445 VirtRegMap *VRM) const {
9446 // This is a bit of a hack (copied from AArch64). Consider this instruction:
9447 //
9448 // %0:sreg_32 = COPY $m0
9449 //
9450 // We explicitly chose SReg_32 for the virtual register so such a copy might
9451 // be eliminated by RegisterCoalescer. However, that may not be possible, and
9452 // %0 may even spill. We can't spill $m0 normally (it would require copying to
9453 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
9454 // TargetInstrInfo::foldMemoryOperand() is going to try.
9455 // A similar issue also exists with spilling and reloading $exec registers.
9456 //
9457 // To prevent that, constrain the %0 register class here.
9458 if (isFullCopyInstr(MI)) {
9459 Register DstReg = MI.getOperand(0).getReg();
9460 Register SrcReg = MI.getOperand(1).getReg();
9461 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
9462 (DstReg.isVirtual() != SrcReg.isVirtual())) {
9464 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
9465 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
9466 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
9467 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
9468 return nullptr;
9469 } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
9470 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
9471 return nullptr;
9472 }
9473 }
9474 }
9475
9476 return nullptr;
9477}
9478
9480 const MachineInstr &MI,
9481 unsigned *PredCost) const {
9482 if (MI.isBundle()) {
9484 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
9485 unsigned Lat = 0, Count = 0;
9486 for (++I; I != E && I->isBundledWithPred(); ++I) {
9487 ++Count;
9488 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
9489 }
9490 return Lat + Count - 1;
9491 }
9492
9493 return SchedModel.computeInstrLatency(&MI);
9494}
9495
9498 unsigned opcode = MI.getOpcode();
9499 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
9500 auto IID = GI->getIntrinsicID();
9505
9506 switch (IID) {
9507 case Intrinsic::amdgcn_if:
9508 case Intrinsic::amdgcn_else:
9509 // FIXME: Uniform if second result
9510 break;
9511 }
9512
9514 }
9515
9516 // Loads from the private and flat address spaces are divergent, because
9517 // threads can execute the load instruction with the same inputs and get
9518 // different results.
9519 //
9520 // All other loads are not divergent, because if threads issue loads with the
9521 // same arguments, they will always get the same result.
9522 if (opcode == AMDGPU::G_LOAD) {
9523 if (MI.memoperands_empty())
9524 return InstructionUniformity::NeverUniform; // conservative assumption
9525
9526 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9527 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9528 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9529 })) {
9530 // At least one MMO in a non-global address space.
9532 }
9534 }
9535
9536 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
9537 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
9538 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
9539 AMDGPU::isGenericAtomic(opcode)) {
9541 }
9543}
9544
9547
9548 if (isNeverUniform(MI))
9550
9551 unsigned opcode = MI.getOpcode();
9552 if (opcode == AMDGPU::V_READLANE_B32 ||
9553 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
9554 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
9556
9557 if (isCopyInstr(MI)) {
9558 const MachineOperand &srcOp = MI.getOperand(1);
9559 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
9560 const TargetRegisterClass *regClass =
9561 RI.getPhysRegBaseClass(srcOp.getReg());
9564 }
9566 }
9567
9568 // GMIR handling
9569 if (MI.isPreISelOpcode())
9571
9572 // Atomics are divergent because they are executed sequentially: when an
9573 // atomic operation refers to the same address in each thread, then each
9574 // thread after the first sees the value written by the previous thread as
 9575 // the original value.
9576
9577 if (isAtomic(MI))
9579
9580 // Loads from the private and flat address spaces are divergent, because
9581 // threads can execute the load instruction with the same inputs and get
9582 // different results.
9583 if (isFLAT(MI) && MI.mayLoad()) {
9584 if (MI.memoperands_empty())
9585 return InstructionUniformity::NeverUniform; // conservative assumption
9586
9587 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9588 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9589 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9590 })) {
9591 // At least one MMO in a non-global address space.
9593 }
9594
9596 }
9597
9598 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9599 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
9600
9601 // FIXME: It's conceptually broken to report this for an instruction, and not
9602 // a specific def operand. For inline asm in particular, there could be mixed
9603 // uniform and divergent results.
9604 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
9605 const MachineOperand &SrcOp = MI.getOperand(I);
9606 if (!SrcOp.isReg())
9607 continue;
9608
9609 Register Reg = SrcOp.getReg();
9610 if (!Reg || !SrcOp.readsReg())
9611 continue;
9612
9613 // If RegBank is null, this is unassigned or an unallocatable special
9614 // register, which are all scalars.
9615 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
9616 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
9618 }
9619
 9620 // TODO: The uniformity check conditions above can be rearranged for more
 9621 // readability.
9622
9623 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
9624 // currently turned into no-op COPYs by SelectionDAG ISel and are
9625 // therefore no longer recognizable.
9626
9628}
9629
9631 switch (MF.getFunction().getCallingConv()) {
9633 return 1;
9635 return 2;
9637 return 3;
9641 report_fatal_error("ds_ordered_count unsupported for this calling conv");
9644 case CallingConv::C:
9645 case CallingConv::Fast:
9646 default:
9647 // Assume other calling conventions are various compute callable functions
9648 return 0;
9649 }
9650}
9651
 9652 bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
 9653 Register &SrcReg2, int64_t &CmpMask,
9654 int64_t &CmpValue) const {
9655 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
9656 return false;
9657
9658 switch (MI.getOpcode()) {
9659 default:
9660 break;
9661 case AMDGPU::S_CMP_EQ_U32:
9662 case AMDGPU::S_CMP_EQ_I32:
9663 case AMDGPU::S_CMP_LG_U32:
9664 case AMDGPU::S_CMP_LG_I32:
9665 case AMDGPU::S_CMP_LT_U32:
9666 case AMDGPU::S_CMP_LT_I32:
9667 case AMDGPU::S_CMP_GT_U32:
9668 case AMDGPU::S_CMP_GT_I32:
9669 case AMDGPU::S_CMP_LE_U32:
9670 case AMDGPU::S_CMP_LE_I32:
9671 case AMDGPU::S_CMP_GE_U32:
9672 case AMDGPU::S_CMP_GE_I32:
9673 case AMDGPU::S_CMP_EQ_U64:
9674 case AMDGPU::S_CMP_LG_U64:
9675 SrcReg = MI.getOperand(0).getReg();
9676 if (MI.getOperand(1).isReg()) {
9677 if (MI.getOperand(1).getSubReg())
9678 return false;
9679 SrcReg2 = MI.getOperand(1).getReg();
9680 CmpValue = 0;
9681 } else if (MI.getOperand(1).isImm()) {
9682 SrcReg2 = Register();
9683 CmpValue = MI.getOperand(1).getImm();
9684 } else {
9685 return false;
9686 }
9687 CmpMask = ~0;
9688 return true;
9689 case AMDGPU::S_CMPK_EQ_U32:
9690 case AMDGPU::S_CMPK_EQ_I32:
9691 case AMDGPU::S_CMPK_LG_U32:
9692 case AMDGPU::S_CMPK_LG_I32:
9693 case AMDGPU::S_CMPK_LT_U32:
9694 case AMDGPU::S_CMPK_LT_I32:
9695 case AMDGPU::S_CMPK_GT_U32:
9696 case AMDGPU::S_CMPK_GT_I32:
9697 case AMDGPU::S_CMPK_LE_U32:
9698 case AMDGPU::S_CMPK_LE_I32:
9699 case AMDGPU::S_CMPK_GE_U32:
9700 case AMDGPU::S_CMPK_GE_I32:
9701 SrcReg = MI.getOperand(0).getReg();
9702 SrcReg2 = Register();
9703 CmpValue = MI.getOperand(1).getImm();
9704 CmpMask = ~0;
9705 return true;
9706 }
9707
9708 return false;
9709}
9710
 9711 bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
 9712 Register SrcReg2, int64_t CmpMask,
9713 int64_t CmpValue,
9714 const MachineRegisterInfo *MRI) const {
9715 if (!SrcReg || SrcReg.isPhysical())
9716 return false;
9717
9718 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
9719 return false;
9720
9721 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
9722 this](int64_t ExpectedValue, unsigned SrcSize,
9723 bool IsReversible, bool IsSigned) -> bool {
9724 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9725 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9726 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9727 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9728 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
9729 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9730 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9731 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9732 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9733 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
9734 //
9735 // Signed ge/gt are not used for the sign bit.
9736 //
9737 // If result of the AND is unused except in the compare:
9738 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
9739 //
9740 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9741 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9742 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
9743 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9744 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9745 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
9746
9747 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
9748 if (!Def || Def->getParent() != CmpInstr.getParent())
9749 return false;
9750
9751 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
9752 Def->getOpcode() != AMDGPU::S_AND_B64)
9753 return false;
9754
9755 int64_t Mask;
9756 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
9757 if (MO->isImm())
9758 Mask = MO->getImm();
9759 else if (!getFoldableImm(MO, Mask))
9760 return false;
9761 Mask &= maxUIntN(SrcSize);
9762 return isPowerOf2_64(Mask);
9763 };
9764
9765 MachineOperand *SrcOp = &Def->getOperand(1);
9766 if (isMask(SrcOp))
9767 SrcOp = &Def->getOperand(2);
9768 else if (isMask(&Def->getOperand(2)))
9769 SrcOp = &Def->getOperand(1);
9770 else
9771 return false;
9772
9773 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
9774 if (IsSigned && BitNo == SrcSize - 1)
9775 return false;
9776
9777 ExpectedValue <<= BitNo;
9778
9779 bool IsReversedCC = false;
9780 if (CmpValue != ExpectedValue) {
9781 if (!IsReversible)
9782 return false;
9783 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
9784 if (!IsReversedCC)
9785 return false;
9786 }
9787
9788 Register DefReg = Def->getOperand(0).getReg();
9789 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
9790 return false;
9791
9792 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
9793 I != E; ++I) {
9794 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
9795 I->killsRegister(AMDGPU::SCC, &RI))
9796 return false;
9797 }
9798
9799 MachineOperand *SccDef = Def->findRegisterDefOperand(AMDGPU::SCC);
9800 SccDef->setIsDead(false);
9801 CmpInstr.eraseFromParent();
9802
9803 if (!MRI->use_nodbg_empty(DefReg)) {
9804 assert(!IsReversedCC);
9805 return true;
9806 }
9807
9808 // Replace AND with unused result with a S_BITCMP.
9809 MachineBasicBlock *MBB = Def->getParent();
9810
9811 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
9812 : AMDGPU::S_BITCMP1_B32
9813 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
9814 : AMDGPU::S_BITCMP1_B64;
9815
9816 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
9817 .add(*SrcOp)
9818 .addImm(BitNo);
9819 Def->eraseFromParent();
9820
9821 return true;
9822 };
9823
9824 switch (CmpInstr.getOpcode()) {
9825 default:
9826 break;
9827 case AMDGPU::S_CMP_EQ_U32:
9828 case AMDGPU::S_CMP_EQ_I32:
9829 case AMDGPU::S_CMPK_EQ_U32:
9830 case AMDGPU::S_CMPK_EQ_I32:
9831 return optimizeCmpAnd(1, 32, true, false);
9832 case AMDGPU::S_CMP_GE_U32:
9833 case AMDGPU::S_CMPK_GE_U32:
9834 return optimizeCmpAnd(1, 32, false, false);
9835 case AMDGPU::S_CMP_GE_I32:
9836 case AMDGPU::S_CMPK_GE_I32:
9837 return optimizeCmpAnd(1, 32, false, true);
9838 case AMDGPU::S_CMP_EQ_U64:
9839 return optimizeCmpAnd(1, 64, true, false);
9840 case AMDGPU::S_CMP_LG_U32:
9841 case AMDGPU::S_CMP_LG_I32:
9842 case AMDGPU::S_CMPK_LG_U32:
9843 case AMDGPU::S_CMPK_LG_I32:
9844 return optimizeCmpAnd(0, 32, true, false);
9845 case AMDGPU::S_CMP_GT_U32:
9846 case AMDGPU::S_CMPK_GT_U32:
9847 return optimizeCmpAnd(0, 32, false, false);
9848 case AMDGPU::S_CMP_GT_I32:
9849 case AMDGPU::S_CMPK_GT_I32:
9850 return optimizeCmpAnd(0, 32, false, true);
9851 case AMDGPU::S_CMP_LG_U64:
9852 return optimizeCmpAnd(0, 64, true, false);
9853 }
9854
9855 return false;
9856}
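// Editor's sketch of one pattern handled above (MIR is illustrative):
//   %1:sreg_32 = S_AND_B32 %0, 4, implicit-def dead $scc
//   S_CMP_LG_U32 %1, 0, implicit-def $scc
//   S_CBRANCH_SCC1 %bb.then
// becomes, when %1 has no other uses,
//   S_BITCMP1_B32 %0, 2, implicit-def $scc
//   S_CBRANCH_SCC1 %bb.then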
9857
 9858 void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
 9859 unsigned OpName) const {
9860 if (!ST.needsAlignedVGPRs())
9861 return;
9862
9863 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
9864 if (OpNo < 0)
9865 return;
9866 MachineOperand &Op = MI.getOperand(OpNo);
9867 if (getOpSize(MI, OpNo) > 4)
9868 return;
9869
9870 // Add implicit aligned super-reg to force alignment on the data operand.
9871 const DebugLoc &DL = MI.getDebugLoc();
9872 MachineBasicBlock *BB = MI.getParent();
9874 Register DataReg = Op.getReg();
9875 bool IsAGPR = RI.isAGPR(MRI, DataReg);
9876 Register Undef = MRI.createVirtualRegister(
9877 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
9878 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
9879 Register NewVR =
9880 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
9881 : &AMDGPU::VReg_64_Align2RegClass);
9882 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
9883 .addReg(DataReg, 0, Op.getSubReg())
9884 .addImm(AMDGPU::sub0)
9885 .addReg(Undef)
9886 .addImm(AMDGPU::sub1);
9887 Op.setReg(NewVR);
9888 Op.setSubReg(AMDGPU::sub0);
9889 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
9890}
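// Editor's sketch of the rewrite performed above (register names illustrative):
//   %data:vgpr_32 = ...
//   DS_WRITE_B32_gfx9 %addr, %data, ...
// becomes
//   %undef:vgpr_32 = IMPLICIT_DEF
//   %pair:vreg_64_align2 = REG_SEQUENCE %data, %subreg.sub0, %undef, %subreg.sub1
//   DS_WRITE_B32_gfx9 %addr, %pair.sub0, ..., implicit %pair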
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
amdgpu AMDGPU Register Bank Select
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
uint64_t High
#define P(N)
TargetInstrInfo::RegSubRegPair RegSubRegPair
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillRestoreOpcode(unsigned Size)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MachineRegisterInfo &MRI, const MCInstrDesc &TID, unsigned RCID, bool IsAllocatable)
static unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillSaveOpcode(unsigned Size)
static bool resultDependsOnExec(const MachineInstr &MI)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
Definition: SIInstrInfo.cpp:85
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static constexpr unsigned ModifierOpNames[]
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
Definition: SIInstrInfo.cpp:76
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool IsDead
static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool has16BitInsts() const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
Class for arbitrary precision integers.
Definition: APInt.h:76
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:262
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:342
bool useVGPRIndexMode() const
bool hasSDWAOmod() const
Definition: GCNSubtarget.h:731
bool hasA16() const
bool hasSDWAScalar() const
Definition: GCNSubtarget.h:735
bool hasScalarCompareEq64() const
Definition: GCNSubtarget.h:944
bool hasOnlyRevVALUShifts() const
Definition: GCNSubtarget.h:376
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:610
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:751
bool hasMAIInsts() const
Definition: GCNSubtarget.h:801
bool hasMFMAInlineLiteralBug() const
bool hasNegativeScratchOffsetBug() const
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasPkMovB32() const
Definition: GCNSubtarget.h:993
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasR128A16() const
bool hasOffset3fBug() const
bool hasGetPCZeroExtension() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:263
const AMDGPURegisterBankInfo * getRegBankInfo() const override
Definition: GCNSubtarget.h:283
bool hasSDWAOutModsVOPC() const
Definition: GCNSubtarget.h:747
bool hasRestrictedSOffset() const
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:666
bool hasGFX940Insts() const
bool hasSDWASdst() const
Definition: GCNSubtarget.h:739
bool hasMovB64() const
bool isWave32() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:329
bool hasNegativeUnalignedScratchOffsetBug() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
Generation getGeneration() const
Definition: GCNSubtarget.h:302
bool hasVOP3Literal() const
Definition: GCNSubtarget.h:871
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:718
bool hasAddr64() const
Definition: GCNSubtarget.h:366
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:710
bool hasGDS() const
bool hasPartialNSAEncoding() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
A possibly irreducible generalization of a Loop.
void getExitBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all of the successor blocks of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:157
iterator find(SlotIndex Pos)
find - Return an iterator pointing to the first segment that ends after Pos, or end().
void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:541
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:611
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:621
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:194
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:237
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition: MCInstrDesc.h:444
bool mayLoad() const
Return true if this instruction could possibly read memory.
Definition: MCInstrDesc.h:438
unsigned short Opcode
Definition: MCInstrDesc.h:205
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
Definition: MCInstrDesc.h:565
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
uint8_t OperandType
Information about the type of the operand.
Definition: MCInstrDesc.h:97
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:91
unsigned getSubRegIdxSize(unsigned Idx) const
Get the size of the bit range covered by a sub-register index.
unsigned getSubRegIdxOffset(unsigned Idx) const
Get the offset of the bit range covered by a sub-register index.
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:397
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:40
void setVariableValue(const MCExpr *Value)
Definition: MCSymbol.cpp:47
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
unsigned pred_size() const
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< iterator > terminators()
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineDomTreeNode * addNewBlock(MachineBasicBlock *BB, MachineBasicBlock *DomBB)
addNewBlock - Add a new node to the dominator tree information.
bool properlyDominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
void changeImmediateDominator(MachineBasicBlock *N, MachineBasicBlock *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
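A minimal sketch that combines CreateMachineBasicBlock, insert, splice and transferSuccessorsAndUpdatePHIs to split a block at a given point; it mirrors the general CFG-rewiring pattern rather than any specific routine in this file:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include <iterator>

using namespace llvm;

// Move everything from SplitPt onward into a new fall-through successor.
static MachineBasicBlock *splitBlockAt(MachineFunction &MF,
                                       MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator SplitPt) {
  MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(std::next(MBB.getIterator()), NewBB);

  NewBB->splice(NewBB->end(), &MBB, SplitPt, MBB.end());
  NewBB->transferSuccessorsAndUpdatePHIs(&MBB); // old successors now hang off NewBB
  MBB.addSuccessor(NewBB);
  return NewBB;
}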
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
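A minimal sketch of the fluent builder calls listed above, chained off BuildMI (documented later in this section); the opcode descriptor, register, frame index and template instruction are placeholders:

#include "llvm/CodeGen/MachineInstrBuilder.h"

using namespace llvm;

// Build "store Data to stack slot FI at offset 0", reusing Tmpl's memrefs.
static void buildFrameStore(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator I, const DebugLoc &DL,
                            const MCInstrDesc &StoreDesc, Register Data,
                            int FI, const MachineInstr &Tmpl) {
  BuildMI(MBB, I, DL, StoreDesc)
      .addReg(Data, RegState::Kill) // value to store, last use
      .addFrameIndex(FI)            // stack slot
      .addImm(0)                    // offset immediate
      .cloneMemRefs(Tmpl);          // reuse the template's memory operands
}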
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:544
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:327
int findRegisterDefOperandIdx(Register Reg, bool isDead=false, bool Overlap=false, const TargetRegisterInfo *TRI=nullptr) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
unsigned getNumOperands() const
Returns the total number of operands.
Definition: MachineInstr.h:547
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
iterator_range< mop_iterator > explicit_operands()
Definition: MachineInstr.h:666
unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:790
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:775
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:757
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:473
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
iterator_range< mop_iterator > implicit_operands()
Definition: MachineInstr.h:674
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:554
uint32_t getFlags() const
Return the MI flags bitvector.
Definition: MachineInstr.h:372
bool addRegisterDead(Register Reg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound=false)
We have determined MI defined a register without a use.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
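A minimal sketch of the in-place MachineOperand mutators above (ChangeToImmediate and friends), the pattern behind immediate-folding rewrites; the instruction and operand index are assumptions of the caller:

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"

using namespace llvm;

// Rewrite a register use as an immediate once its value is known constant.
static bool foldConstantUse(MachineInstr &UseMI, unsigned OpIdx, int64_t Imm) {
  MachineOperand &MO = UseMI.getOperand(OpIdx);
  if (!MO.isReg() || MO.isDef())
    return false;
  MO.ChangeToImmediate(Imm); // the operand is now MO_Immediate
  return true;
}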
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
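A minimal sketch of backwards scavenging with the interface above; the register class is a caller-supplied assumption, and with AllowSpill=false the result can be an invalid Register that must be checked:

#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

// Find a register of RC that is free at MI, without spilling anything.
static Register findFreeRegAt(RegScavenger &RS, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const TargetRegisterClass &RC) {
  RS.enterBasicBlockEnd(MBB); // start liveness tracking at the block end
  Register Scav = RS.scavengeRegisterBackwards(
      RC, MI, /*RestoreAfter=*/false, /*SPAdj=*/0, /*AllowSpill=*/false);
  if (Scav)
    RS.setRegUsed(Scav); // keep it reserved for the caller
  return Scav;           // invalid if nothing was free
}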
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:792
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:554
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
bool isNonUniformBranchInstr(MachineInstr &Instr) const
static bool isVOP3(const MachineInstr &MI)
Definition: SIInstrInfo.h:504
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
Definition: SIInstrInfo.h:924
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
Definition: SIInstrInfo.h:1094
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:636
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:544
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const
Get required immediate operand.
Definition: SIInstrInfo.h:1213
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:536
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:649
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:408
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:496
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:512
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:604
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:520
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
bool isSpill(uint16_t Opcode) const
Definition: SIInstrInfo.h:740
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:588
static bool isSOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:448
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:570
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
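A minimal sketch of looking an operand up by name rather than by index, with hasNamedOperand (documented later in this section) as the existence check; it assumes the offset operand, when present, is an immediate, and that the AMDGPU operand-name tables are visible through SIInstrInfo.h:

#include "SIInstrInfo.h"

using namespace llvm;

// Read the byte offset of a memory instruction, or 0 if it has none.
static int64_t getImmOffset(const SIInstrInfo &TII, const MachineInstr &MI) {
  if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::offset))
    return 0;
  return TII.getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
}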
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:628
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:596
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
Definition: SIInstrInfo.h:424
static bool isSOPK(const MachineInstr &MI)
Definition: SIInstrInfo.h:464
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:936
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:528
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:618
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:760
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
void materializeImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, int64_t Value) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by the assembler.
static bool isVGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:716
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool isVGPRCopy(const MachineInstr &MI) const
Definition: SIInstrInfo.h:959
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:580
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description.
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
void convertNonUniformIfRegion(MachineBasicBlock *IfEntry, MachineBasicBlock *IfEnd) const
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
Whether we must prevent this instruction from executing with EXEC = 0.
static bool isAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:681
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
static bool sopkIsZext(unsigned Opcode)
Definition: SIInstrInfo.h:863
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
static bool isSGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:728
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:809
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:456
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:612
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:416
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
Definition: SIInstrInfo.h:1226
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
static bool isFixedSize(const MachineInstr &MI)
Definition: SIInstrInfo.h:880
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
static bool isLDSDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:562
unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isVOP1(const MachineInstr &MI)
Definition: SIInstrInfo.h:488
SIInstrInfo(const GCNSubtarget &ST)
Definition: SIInstrInfo.cpp:66
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
static bool hasVectorRegisters(const TargetRegisterClass *RC)
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool opCanUseInlineConstant(unsigned OpType) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
bool opCanUseLiteralConstant(unsigned OpType) const
static bool hasVGPRs(const TargetRegisterClass *RC)
static bool isVGPRClass(const TargetRegisterClass *RC)
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
const TargetRegisterClass * getBoolRC() const
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getChannelFromSubReg(unsigned SubReg) const
MCRegister getVCC() const
static bool hasAGPRs(const TargetRegisterClass *RC)
const TargetRegisterClass * getWaveMaskRegClass() const
bool spillSGPRToVGPR() const
const TargetRegisterClass * getVGPR64Class() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:559
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:68
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:240
SlotIndexes pass.
Definition: SlotIndexes.h:300
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:523
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
const TargetRegisterClass * getAllocatableClass(const TargetRegisterClass *RC) const
Return the maximal subclass of the given register class that is allocatable or NULL.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
self_iterator getIterator()
Definition: ilist_node.h:109
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
Definition: SIInstrInfo.h:1504
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isDPALU_DPP(const MCInstrDesc &OpDesc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
Definition: SIInstrInfo.h:1505
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
bool isTrue16Inst(unsigned Opc)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo)
Is this an AMDGPU specific source operand? These include registers, inline constants,...
const uint64_t RSRC_TID_ENABLE
Definition: SIInstrInfo.h:1507
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isGenericAtomic(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isHi(unsigned Reg, const MCRegisterInfo &MRI)
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition: SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition: SIDefines.h:201
@ OPERAND_REG_IMM_V2FP16
Definition: SIDefines.h:211
@ OPERAND_REG_INLINE_C_V2INT32
Definition: SIDefines.h:227
@ OPERAND_REG_INLINE_C_FP64
Definition: SIDefines.h:223
@ OPERAND_REG_INLINE_C_BF16
Definition: SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2BF16
Definition: SIDefines.h:225
@ OPERAND_REG_IMM_V2INT16
Definition: SIDefines.h:212
@ OPERAND_REG_IMM_BF16
Definition: SIDefines.h:205
@ OPERAND_REG_INLINE_AC_V2FP16
Definition: SIDefines.h:246
@ OPERAND_REG_IMM_INT32
Operands with register or 32-bit immediate.
Definition: SIDefines.h:200
@ OPERAND_REG_IMM_V2BF16
Definition: SIDefines.h:210
@ OPERAND_REG_IMM_BF16_DEFERRED
Definition: SIDefines.h:207
@ OPERAND_REG_IMM_FP16
Definition: SIDefines.h:206
@ OPERAND_REG_INLINE_C_INT64
Definition: SIDefines.h:219
@ OPERAND_REG_INLINE_AC_BF16
Definition: SIDefines.h:240
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition: SIDefines.h:217
@ OPERAND_REG_INLINE_AC_INT16
Operands with an AccVGPR register or inline constant.
Definition: SIDefines.h:238
@ OPERAND_REG_IMM_FP64
Definition: SIDefines.h:204
@ OPERAND_REG_INLINE_C_V2FP16
Definition: SIDefines.h:226
@ OPERAND_REG_INLINE_AC_V2INT16
Definition: SIDefines.h:244
@ OPERAND_REG_INLINE_AC_FP16
Definition: SIDefines.h:241
@ OPERAND_REG_INLINE_AC_INT32
Definition: SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition: SIDefines.h:242
@ OPERAND_REG_INLINE_AC_V2BF16
Definition: SIDefines.h:245
@ OPERAND_REG_IMM_V2INT32
Definition: SIDefines.h:213
@ OPERAND_REG_IMM_FP32
Definition: SIDefines.h:203
@ OPERAND_INPUT_MODS
Definition: SIDefines.h:251
@ OPERAND_REG_INLINE_C_FP32
Definition: SIDefines.h:222
@ OPERAND_REG_INLINE_C_INT32
Definition: SIDefines.h:218
@ OPERAND_REG_INLINE_C_V2INT16
Definition: SIDefines.h:224
@ OPERAND_REG_IMM_V2FP32
Definition: SIDefines.h:214
@ OPERAND_REG_INLINE_AC_FP64
Definition: SIDefines.h:243
@ OPERAND_REG_INLINE_C_FP16
Definition: SIDefines.h:221
@ OPERAND_REG_IMM_INT16
Definition: SIDefines.h:202
@ OPERAND_REG_INLINE_C_V2FP32
Definition: SIDefines.h:228
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition: SIDefines.h:231
@ OPERAND_REG_IMM_FP32_DEFERRED
Definition: SIDefines.h:209
@ OPERAND_REG_IMM_FP16_DEFERRED
Definition: SIDefines.h:208
@ TI_SCRATCH_RSRC_DWORD1
Definition: AMDGPU.h:409
@ TI_SCRATCH_RSRC_DWORD3
Definition: AMDGPU.h:411
@ TI_SCRATCH_RSRC_DWORD0
Definition: AMDGPU.h:408
@ TI_SCRATCH_RSRC_DWORD2
Definition: AMDGPU.h:410
@ TI_CONSTDATA_START
Definition: AMDGPU.h:407
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
int getMCOpcode(uint16_t Opcode, unsigned Gen)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
Definition: SIInstrInfo.h:1506
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
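A minimal sketch of the inline-literal checks above; HasInv2Pi stands for the subtarget's support for the 1/(2*pi) inline value and is passed in by the caller:

#include "Utils/AMDGPUBaseInfo.h"

using namespace llvm;

// True if Imm can be encoded inline, false if it needs a literal constant.
static bool canUseInlineConstant(int64_t Imm, bool Is64Bit, bool HasInv2Pi) {
  return Is64Bit
             ? AMDGPU::isInlinableLiteral64(Imm, HasInv2Pi)
             : AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Imm),
                                            HasInv2Pi);
}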
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
bool isGraphics(CallingConv::ID cc)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ OPERAND_GENERIC_4
Definition: MCInstrDesc.h:70
@ OPERAND_GENERIC_2
Definition: MCInstrDesc.h:68
@ OPERAND_GENERIC_1
Definition: MCInstrDesc.h:67
@ OPERAND_REGISTER
Definition: MCInstrDesc.h:61
@ OPERAND_GENERIC_3
Definition: MCInstrDesc.h:69
@ OPERAND_IMMEDIATE
Definition: MCInstrDesc.h:60
@ OPERAND_UNKNOWN
Definition: MCInstrDesc.h:59
@ OPERAND_GENERIC_0
Definition: MCInstrDesc.h:66
@ OPERAND_GENERIC_5
Definition: MCInstrDesc.h:71
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:456
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition: SIInstrInfo.h:1395
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1731
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:665
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition: MathExtras.h:269
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1738
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:136
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:141
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:233
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
Definition: TargetOpcodes.h:36
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Error
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1888
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t that is less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:428
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition: Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition: MathExtras.h:201
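Worked values for the MathExtras helpers listed in this section (divideCeil, alignDown, maxUIntN, Log2_32, Hi_32/Lo_32, isIntN), as a quick sanity sketch:

#include "llvm/Support/MathExtras.h"
#include <cassert>

using namespace llvm;

static void mathHelperExamples() {
  assert(divideCeil(10, 4) == 3);             // ceil(10 / 4)
  assert(alignDown(13, 8) == 8);              // largest multiple of 8 <= 13
  assert(maxUIntN(16) == 65535);              // 2^16 - 1
  assert(Log2_32(32) == 5);                   // floor(log2(32))
  assert(Hi_32(0x1234567800000000ULL) == 0x12345678u);
  assert(Lo_32(0x00000000abcdef01ULL) == 0xabcdef01u);
  assert(isIntN(8, -128) && !isIntN(8, 128)); // signed 8-bit range check
}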
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition: APFloat.cpp:216
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Description of the encoding of one expression Op.
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
Definition: LiveVariables.h:85
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
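A minimal sketch of building a spill-slot memory operand from MachinePointerInfo::getFixedStack and MachineFunction::getMachineMemOperand; size and alignment come from the frame info, MOLoad would become MOStore on the store side, and the size/alignment overload used here is an assumption of this sketch:

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

using namespace llvm;

// Describe a load from the given frame index for use on a reload instruction.
static MachineMemOperand *makeFrameLoadMMO(MachineFunction &MF, int FrameIndex) {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MachinePointerInfo PtrInfo =
      MachinePointerInfo::getFixedStack(MF, FrameIndex);
  return MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                 MFI.getObjectSize(FrameIndex),
                                 MFI.getObjectAlign(FrameIndex));
}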
Utility to store a worklist of machine instructions.
Definition: SIInstrInfo.h:49
MachineInstr * top() const
Definition: SIInstrInfo.h:54
bool empty() const
Definition: SIInstrInfo.h:64
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition: SIInstrInfo.h:73
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.