1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
31#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include "llvm/MC/MCContext.h"
35
36using namespace llvm;
37
38#define DEBUG_TYPE "si-instr-info"
39
40#define GET_INSTRINFO_CTOR_DTOR
41#include "AMDGPUGenInstrInfo.inc"
42
43namespace llvm::AMDGPU {
44#define GET_D16ImageDimIntrinsics_IMPL
45#define GET_ImageDimIntrinsicTable_IMPL
46#define GET_RsrcIntrinsics_IMPL
47#include "AMDGPUGenSearchableTables.inc"
48} // namespace llvm::AMDGPU
49
50// Must be at least 4 to be able to branch over minimum unconditional branch
51// code. This is only for making it possible to write reasonably small tests for
52// long branches.
53static cl::opt<unsigned>
54BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
55 cl::desc("Restrict range of branch instructions (DEBUG)"));
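// For example, a test can pass something like -amdgpu-s-branch-bits=5 to llc
// so that only a handful of instructions is enough to push a branch out of
// range (illustrative value, not mandated by the code above).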
56
58 "amdgpu-fix-16-bit-physreg-copies",
59 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
60 cl::init(true),
61 cl::ReallyHidden);
62
63SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
64 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
65 RI(ST), ST(ST) {
66 SchedModel.init(&ST);
67}
68
69//===----------------------------------------------------------------------===//
70// TargetInstrInfo callbacks
71//===----------------------------------------------------------------------===//
72
73static unsigned getNumOperandsNoGlue(SDNode *Node) {
74 unsigned N = Node->getNumOperands();
75 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
76 --N;
77 return N;
78}
79
80/// Returns true if both nodes have the same value for the given
81/// operand \p Op, or if both nodes do not have this operand.
82static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
83 unsigned Opc0 = N0->getMachineOpcode();
84 unsigned Opc1 = N1->getMachineOpcode();
85
86 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
87 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
88
89 if (Op0Idx == -1 && Op1Idx == -1)
90 return true;
91
92
93 if ((Op0Idx == -1 && Op1Idx != -1) ||
94 (Op1Idx == -1 && Op0Idx != -1))
95 return false;
96
97 // getNamedOperandIdx returns the index for the MachineInstr's operands,
98 // which includes the result as the first operand. We are indexing into the
99 // MachineSDNode's operands, so we need to skip the result operand to get
100 // the real index.
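// For example, if getNamedOperandIdx returns 2 for a named operand of the
// MachineInstr, the same operand sits at index 1 of the MachineSDNode.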
101 --Op0Idx;
102 --Op1Idx;
103
104 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
105}
106
107static bool canRemat(const MachineInstr &MI) {
108
109 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
110 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
111 SIInstrInfo::isSALU(MI))
112 return true;
113
114 if (SIInstrInfo::isSMRD(MI)) {
115 return !MI.memoperands_empty() &&
116 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
117 return MMO->isLoad() && MMO->isInvariant();
118 });
119 }
120
121 return false;
122}
123
124bool SIInstrInfo::isReallyTriviallyReMaterializable(
125 const MachineInstr &MI) const {
126
127 if (canRemat(MI)) {
128 // Normally a VALU use of exec would block rematerialization, but an
129 // implicit exec read is OK in this case since all VALU instructions have
130 // one. Apart from that, we really want all of the generic logic here.
131
132 // Another potential implicit use is mode register. The core logic of
133 // the RA will not attempt rematerialization if mode is set anywhere
134 // in the function, otherwise it is safe since mode is not changed.
135
136 // This differs from the generic method, which does not allow
137 // rematerialization if there are virtual register uses. We allow those,
138 // which is why this method covers SOP instructions as well.
139 if (!MI.hasImplicitDef() &&
140 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
141 !MI.mayRaiseFPException())
142 return true;
143 }
144
145 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
146}
147
148// Returns true if the scalar result of a VALU instruction depends on exec.
149static bool resultDependsOnExec(const MachineInstr &MI) {
150 // Ignore comparisons which are only used masked with exec.
151 // This allows some hoisting/sinking of VALU comparisons.
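// For example, a V_CMP whose result is only consumed by S_AND_B64 instructions
// that also read exec is treated here as not depending on exec.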
152 if (MI.isCompare()) {
153 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
154 Register DstReg = MI.getOperand(0).getReg();
155 if (!DstReg.isVirtual())
156 return true;
157 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
158 switch (Use.getOpcode()) {
159 case AMDGPU::S_AND_SAVEEXEC_B32:
160 case AMDGPU::S_AND_SAVEEXEC_B64:
161 break;
162 case AMDGPU::S_AND_B32:
163 case AMDGPU::S_AND_B64:
164 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
165 return true;
166 break;
167 default:
168 return true;
169 }
170 }
171 return false;
172 }
173
174 switch (MI.getOpcode()) {
175 default:
176 break;
177 case AMDGPU::V_READFIRSTLANE_B32:
178 return true;
179 }
180
181 return false;
182}
183
184bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
185 // Any implicit use of exec by VALU is not a real register read.
186 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
187 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
188}
189
190bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
191 MachineBasicBlock *SuccToSinkTo,
192 MachineCycleInfo *CI) const {
193 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
194 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
195 return true;
196
197 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
198 // Check if sinking of MI would create temporal divergent use.
199 for (auto Op : MI.uses()) {
200 if (Op.isReg() && Op.getReg().isVirtual() &&
201 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
202 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
203
204 // SgprDef defined inside cycle
205 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
206 if (FromCycle == nullptr)
207 continue;
208
209 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
210 // Check if there is a FromCycle that contains SgprDef's basic block but
211 // does not contain SuccToSinkTo and also has a divergent exit condition.
212 while (FromCycle && !FromCycle->contains(ToCycle)) {
213 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
214 FromCycle->getExitingBlocks(ExitingBlocks);
215
216 // FromCycle has divergent exit condition.
217 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
218 if (hasDivergentBranch(ExitingBlock))
219 return false;
220 }
221
222 FromCycle = FromCycle->getParentCycle();
223 }
224 }
225 }
226
227 return true;
228}
229
230bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
231 int64_t &Offset0,
232 int64_t &Offset1) const {
233 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
234 return false;
235
236 unsigned Opc0 = Load0->getMachineOpcode();
237 unsigned Opc1 = Load1->getMachineOpcode();
238
239 // Make sure both are actually loads.
240 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
241 return false;
242
243 // A mayLoad instruction without a def is not a load. Likely a prefetch.
244 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
245 return false;
246
247 if (isDS(Opc0) && isDS(Opc1)) {
248
249 // FIXME: Handle this case:
250 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
251 return false;
252
253 // Check base reg.
254 if (Load0->getOperand(0) != Load1->getOperand(0))
255 return false;
256
257 // Skip read2 / write2 variants for simplicity.
258 // TODO: We should report true if the used offsets are adjacent (excluding
259 // the st64 versions).
260 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
261 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
262 if (Offset0Idx == -1 || Offset1Idx == -1)
263 return false;
264
265 // XXX - be careful of dataless loads
266 // getNamedOperandIdx returns the index for MachineInstrs. Since they
267 // include the output in the operand list, but SDNodes don't, we need to
268 // subtract the index by one.
269 Offset0Idx -= get(Opc0).NumDefs;
270 Offset1Idx -= get(Opc1).NumDefs;
271 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
272 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
273 return true;
274 }
275
276 if (isSMRD(Opc0) && isSMRD(Opc1)) {
277 // Skip time and cache invalidation instructions.
278 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
279 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
280 return false;
281
282 unsigned NumOps = getNumOperandsNoGlue(Load0);
283 if (NumOps != getNumOperandsNoGlue(Load1))
284 return false;
285
286 // Check base reg.
287 if (Load0->getOperand(0) != Load1->getOperand(0))
288 return false;
289
290 // Match register offsets, if both register and immediate offsets present.
291 assert(NumOps == 4 || NumOps == 5);
292 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
293 return false;
294
295 const ConstantSDNode *Load0Offset =
296 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
297 const ConstantSDNode *Load1Offset =
298 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
299
300 if (!Load0Offset || !Load1Offset)
301 return false;
302
303 Offset0 = Load0Offset->getZExtValue();
304 Offset1 = Load1Offset->getZExtValue();
305 return true;
306 }
307
308 // MUBUF and MTBUF can access the same addresses.
309 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
310
311 // MUBUF and MTBUF have vaddr at different indices.
312 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
313 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
314 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
315 return false;
316
317 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
318 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
319
320 if (OffIdx0 == -1 || OffIdx1 == -1)
321 return false;
322
323 // getNamedOperandIdx returns the index for MachineInstrs. Since they
324 // include the output in the operand list, but SDNodes don't, we need to
325 // subtract the index by one.
326 OffIdx0 -= get(Opc0).NumDefs;
327 OffIdx1 -= get(Opc1).NumDefs;
328
329 SDValue Off0 = Load0->getOperand(OffIdx0);
330 SDValue Off1 = Load1->getOperand(OffIdx1);
331
332 // The offset might be a FrameIndexSDNode.
333 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
334 return false;
335
336 Offset0 = Off0->getAsZExtVal();
337 Offset1 = Off1->getAsZExtVal();
338 return true;
339 }
340
341 return false;
342}
343
344static bool isStride64(unsigned Opc) {
345 switch (Opc) {
346 case AMDGPU::DS_READ2ST64_B32:
347 case AMDGPU::DS_READ2ST64_B64:
348 case AMDGPU::DS_WRITE2ST64_B32:
349 case AMDGPU::DS_WRITE2ST64_B64:
350 return true;
351 default:
352 return false;
353 }
354}
355
356bool SIInstrInfo::getMemOperandsWithOffsetWidth(
357 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
358 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
359 const TargetRegisterInfo *TRI) const {
360 if (!LdSt.mayLoadOrStore())
361 return false;
362
363 unsigned Opc = LdSt.getOpcode();
364 OffsetIsScalable = false;
365 const MachineOperand *BaseOp, *OffsetOp;
366 int DataOpIdx;
367
368 if (isDS(LdSt)) {
369 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
370 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
371 if (OffsetOp) {
372 // Normal, single offset LDS instruction.
373 if (!BaseOp) {
374 // DS_CONSUME/DS_APPEND use M0 for the base address.
375 // TODO: find the implicit use operand for M0 and use that as BaseOp?
376 return false;
377 }
378 BaseOps.push_back(BaseOp);
379 Offset = OffsetOp->getImm();
380 // Get appropriate operand, and compute width accordingly.
381 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
382 if (DataOpIdx == -1)
383 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
384 Width = getOpSize(LdSt, DataOpIdx);
385 } else {
386 // The 2 offset instructions use offset0 and offset1 instead. We can treat
387 // these as a load with a single offset if the 2 offsets are consecutive.
388 // We will use this for some partially aligned loads.
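// For example (illustrative values), a DS_READ2_B32 with offset0 = 4 and
// offset1 = 5 is reported as one 8-byte access at byte offset 4 * EltSize.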
389 const MachineOperand *Offset0Op =
390 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
391 const MachineOperand *Offset1Op =
392 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
393
394 unsigned Offset0 = Offset0Op->getImm() & 0xff;
395 unsigned Offset1 = Offset1Op->getImm() & 0xff;
396 if (Offset0 + 1 != Offset1)
397 return false;
398
399 // Each of these offsets is in element-sized units, so we need to convert
400 // them to the byte offset of the individual reads.
401
402 unsigned EltSize;
403 if (LdSt.mayLoad())
404 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
405 else {
406 assert(LdSt.mayStore());
407 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
408 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
409 }
410
411 if (isStride64(Opc))
412 EltSize *= 64;
413
414 BaseOps.push_back(BaseOp);
415 Offset = EltSize * Offset0;
416 // Get appropriate operand(s), and compute width accordingly.
417 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
418 if (DataOpIdx == -1) {
419 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 Width = getOpSize(LdSt, DataOpIdx);
421 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
422 Width = Width.getValue() + getOpSize(LdSt, DataOpIdx);
423 } else {
424 Width = getOpSize(LdSt, DataOpIdx);
425 }
426 }
427 return true;
428 }
429
430 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
431 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
432 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
433 return false;
434 BaseOps.push_back(RSrc);
435 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
436 if (BaseOp && !BaseOp->isFI())
437 BaseOps.push_back(BaseOp);
438 const MachineOperand *OffsetImm =
439 getNamedOperand(LdSt, AMDGPU::OpName::offset);
440 Offset = OffsetImm->getImm();
441 const MachineOperand *SOffset =
442 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
443 if (SOffset) {
444 if (SOffset->isReg())
445 BaseOps.push_back(SOffset);
446 else
447 Offset += SOffset->getImm();
448 }
449 // Get appropriate operand, and compute width accordingly.
450 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
451 if (DataOpIdx == -1)
452 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
453 if (DataOpIdx == -1) // LDS DMA
454 return false;
455 Width = getOpSize(LdSt, DataOpIdx);
456 return true;
457 }
458
459 if (isImage(LdSt)) {
460 auto RsrcOpName =
461 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
462 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
463 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
464 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
465 if (VAddr0Idx >= 0) {
466 // GFX10 possible NSA encoding.
467 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
468 BaseOps.push_back(&LdSt.getOperand(I));
469 } else {
470 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
471 }
472 Offset = 0;
473 // Get appropriate operand, and compute width accordingly.
474 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
475 Width = getOpSize(LdSt, DataOpIdx);
476 return true;
477 }
478
479 if (isSMRD(LdSt)) {
480 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
481 if (!BaseOp) // e.g. S_MEMTIME
482 return false;
483 BaseOps.push_back(BaseOp);
484 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
485 Offset = OffsetOp ? OffsetOp->getImm() : 0;
486 // Get appropriate operand, and compute width accordingly.
487 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
488 if (DataOpIdx == -1)
489 return false;
490 Width = getOpSize(LdSt, DataOpIdx);
491 return true;
492 }
493
494 if (isFLAT(LdSt)) {
495 // Instructions have either vaddr or saddr or both or none.
496 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
497 if (BaseOp)
498 BaseOps.push_back(BaseOp);
499 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
500 if (BaseOp)
501 BaseOps.push_back(BaseOp);
502 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
503 // Get appropriate operand, and compute width accordingly.
504 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
505 if (DataOpIdx == -1)
506 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
507 if (DataOpIdx == -1) // LDS DMA
508 return false;
509 Width = getOpSize(LdSt, DataOpIdx);
510 return true;
511 }
512
513 return false;
514}
515
516static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
517 ArrayRef<const MachineOperand *> BaseOps1,
518 const MachineInstr &MI2,
519 ArrayRef<const MachineOperand *> BaseOps2) {
520 // Only examine the first "base" operand of each instruction, on the
521 // assumption that it represents the real base address of the memory access.
522 // Other operands are typically offsets or indices from this base address.
523 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
524 return true;
525
526 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
527 return false;
528
529 auto MO1 = *MI1.memoperands_begin();
530 auto MO2 = *MI2.memoperands_begin();
531 if (MO1->getAddrSpace() != MO2->getAddrSpace())
532 return false;
533
534 auto Base1 = MO1->getValue();
535 auto Base2 = MO2->getValue();
536 if (!Base1 || !Base2)
537 return false;
538 Base1 = getUnderlyingObject(Base1);
539 Base2 = getUnderlyingObject(Base2);
540
541 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
542 return false;
543
544 return Base1 == Base2;
545}
546
547bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
548 int64_t Offset1, bool OffsetIsScalable1,
549 ArrayRef<const MachineOperand *> BaseOps2,
550 int64_t Offset2, bool OffsetIsScalable2,
551 unsigned ClusterSize,
552 unsigned NumBytes) const {
553 // If the mem ops (to be clustered) do not have the same base ptr, then they
554 // should not be clustered
555 if (!BaseOps1.empty() && !BaseOps2.empty()) {
556 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
557 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
558 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
559 return false;
560 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
561 // If only one base op is empty, they do not have the same base ptr
562 return false;
563 }
564
565 // To avoid register pressure, the number of DWORDs loaded together by all
566 // clustered mem ops should not, on average, exceed 8. This is an empirical
567 // value based on certain observations and performance related experiments.
568 // The good thing about this heuristic is that it avoids clustering too many
569 // sub-word loads, and also avoids clustering wide loads.
570 //
571 // Below is a brief summary of how the heuristic behaves for various `LoadSize`:
572 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
573 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
574 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
575 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
576 // (5) LoadSize >= 17: do not cluster
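// Worked example: ClusterSize = 4 and NumBytes = 16 give LoadSize = 4 and
// NumDWORDs = 4, so clustering is allowed; ClusterSize = 2 and NumBytes = 34
// give LoadSize = 17 and NumDWORDs = 10, so clustering is rejected.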
577 const unsigned LoadSize = NumBytes / ClusterSize;
578 const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
579 return NumDWORDs <= 8;
580}
581
582// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
583// the first 16 loads will be interleaved with the stores, and the next 16 will
584// be clustered as expected. It should really split into two batches of 16 stores.
585//
586// Loads are clustered until this returns false, rather than trying to schedule
587// groups of stores. This also means we have to deal with saying different
588// address space loads should be clustered, and ones which might cause bank
589// conflicts.
590//
591// This might be deprecated so it might not be worth that much effort to fix.
592bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
593 int64_t Offset0, int64_t Offset1,
594 unsigned NumLoads) const {
595 assert(Offset1 > Offset0 &&
596 "Second offset should be larger than first offset!");
597 // If we have less than 16 loads in a row, and the offsets are within 64
598 // bytes, then schedule together.
599
600 // A cacheline is 64 bytes (for global memory).
601 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
602}
603
604static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
605 MachineBasicBlock::iterator MI,
606 const DebugLoc &DL, MCRegister DestReg,
607 MCRegister SrcReg, bool KillSrc,
608 const char *Msg = "illegal VGPR to SGPR copy") {
609 MachineFunction *MF = MBB.getParent();
610 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
611 LLVMContext &C = MF->getFunction().getContext();
612 C.diagnose(IllegalCopy);
613
614 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
615 .addReg(SrcReg, getKillRegState(KillSrc));
616}
617
618/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
619/// possible to have a direct copy in these cases on GFX908, so an intermediate
620/// VGPR copy is required.
621static void indirectCopyToAGPR(const SIInstrInfo &TII,
622 MachineBasicBlock &MBB,
623 MachineBasicBlock::iterator MI,
624 const DebugLoc &DL, MCRegister DestReg,
625 MCRegister SrcReg, bool KillSrc,
626 RegScavenger &RS, bool RegsOverlap,
627 Register ImpDefSuperReg = Register(),
628 Register ImpUseSuperReg = Register()) {
629 assert((TII.getSubtarget().hasMAIInsts() &&
630 !TII.getSubtarget().hasGFX90AInsts()) &&
631 "Expected GFX908 subtarget.");
632
633 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
634 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
635 "Source register of the copy should be either an SGPR or an AGPR.");
636
637 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
638 "Destination register of the copy should be an AGPR.");
639
640 const SIRegisterInfo &RI = TII.getRegisterInfo();
641
642 // First try to find defining accvgpr_write to avoid temporary registers.
643 // In the case of copies of overlapping AGPRs, we conservatively do not
644 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
645 // an accvgpr_write used for this same copy due to implicit-defs
646 if (!RegsOverlap) {
647 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
648 --Def;
649
650 if (!Def->modifiesRegister(SrcReg, &RI))
651 continue;
652
653 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
654 Def->getOperand(0).getReg() != SrcReg)
655 break;
656
657 MachineOperand &DefOp = Def->getOperand(1);
658 assert(DefOp.isReg() || DefOp.isImm());
659
660 if (DefOp.isReg()) {
661 bool SafeToPropagate = true;
662 // Check that register source operand is not clobbered before MI.
663 // Immediate operands are always safe to propagate.
664 for (auto I = Def; I != MI && SafeToPropagate; ++I)
665 if (I->modifiesRegister(DefOp.getReg(), &RI))
666 SafeToPropagate = false;
667
668 if (!SafeToPropagate)
669 break;
670
671 DefOp.setIsKill(false);
672 }
673
674 MachineInstrBuilder Builder =
675 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
676 .add(DefOp);
677 if (ImpDefSuperReg)
678 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
679
680 if (ImpUseSuperReg) {
681 Builder.addReg(ImpUseSuperReg,
682 getKillRegState(KillSrc) | RegState::Implicit);
683 }
684
685 return;
686 }
687 }
688
689 RS.enterBasicBlockEnd(MBB);
690 RS.backward(std::next(MI));
691
692 // Ideally we want to have three registers for a long reg_sequence copy
693 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
694 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
695 *MBB.getParent());
696
697 // Registers in the sequence are allocated contiguously so we can just
698 // use register number to pick one of three round-robin temps.
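// For example, destinations AGPR0, AGPR1 and AGPR2 map to RegNo 0, 1 and 2,
// so consecutive copies rotate through up to three scavenged temporaries.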
699 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
700 Register Tmp =
701 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
703 "VGPR used for an intermediate copy should have been reserved.");
704
705 // Only loop through if there are any free registers left. We don't want to
706 // spill.
707 while (RegNo--) {
708 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
709 /* RestoreAfter */ false, 0,
710 /* AllowSpill */ false);
711 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
712 break;
713 Tmp = Tmp2;
714 RS.setRegUsed(Tmp);
715 }
716
717 // Insert copy to temporary VGPR.
718 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
719 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
720 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
721 } else {
722 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
723 }
724
725 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
726 .addReg(SrcReg, getKillRegState(KillSrc));
727 if (ImpUseSuperReg) {
728 UseBuilder.addReg(ImpUseSuperReg,
729 getKillRegState(KillSrc) | RegState::Implicit);
730 }
731
732 MachineInstrBuilder DefBuilder
733 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
734 .addReg(Tmp, RegState::Kill);
735
736 if (ImpDefSuperReg)
737 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
738}
739
740static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
741 MachineBasicBlock::iterator MI, const DebugLoc &DL,
742 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
743 const TargetRegisterClass *RC, bool Forward) {
744 const SIRegisterInfo &RI = TII.getRegisterInfo();
745 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
746 MachineBasicBlock::iterator I = MI;
747 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
748
749 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
750 int16_t SubIdx = BaseIndices[Idx];
751 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
752 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
753 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
754 unsigned Opcode = AMDGPU::S_MOV_B32;
755
756 // Is SGPR aligned? If so try to combine with next.
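// For example, a copy from s[4:5] to s[8:9] starts on an even SGPR on both
// sides, so it can use one S_MOV_B64 instead of two S_MOV_B32 instructions.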
757 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
758 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
759 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
760 // Can use SGPR64 copy
761 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
762 SubIdx = RI.getSubRegFromChannel(Channel, 2);
763 DestSubReg = RI.getSubReg(DestReg, SubIdx);
764 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
765 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
766 Opcode = AMDGPU::S_MOV_B64;
767 Idx++;
768 }
769
770 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
771 .addReg(SrcSubReg)
772 .addReg(SrcReg, RegState::Implicit);
773
774 if (!FirstMI)
775 FirstMI = LastMI;
776
777 if (!Forward)
778 I--;
779 }
780
781 assert(FirstMI && LastMI);
782 if (!Forward)
783 std::swap(FirstMI, LastMI);
784
785 FirstMI->addOperand(
786 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
787
788 if (KillSrc)
789 LastMI->addRegisterKilled(SrcReg, &RI);
790}
791
792void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
793 MachineBasicBlock::iterator MI,
794 const DebugLoc &DL, MCRegister DestReg,
795 MCRegister SrcReg, bool KillSrc) const {
796 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
797 unsigned Size = RI.getRegSizeInBits(*RC);
798 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
799 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
800
801 // The rest of copyPhysReg assumes Src and Dst are the same size.
802 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
803 // we remove Fix16BitCopies and this code block?
804 if (Fix16BitCopies) {
805 if (((Size == 16) != (SrcSize == 16))) {
806 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
808 MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
809 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
810 RegToFix = SubReg;
811
812 if (DestReg == SrcReg) {
813 // Identity copy. Insert empty bundle since ExpandPostRA expects an
814 // instruction here.
815 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
816 return;
817 }
818 RC = RI.getPhysRegBaseClass(DestReg);
819 Size = RI.getRegSizeInBits(*RC);
820 SrcRC = RI.getPhysRegBaseClass(SrcReg);
821 SrcSize = RI.getRegSizeInBits(*SrcRC);
822 }
823 }
824
825 if (RC == &AMDGPU::VGPR_32RegClass) {
826 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
827 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
828 AMDGPU::AGPR_32RegClass.contains(SrcReg));
829 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
830 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
831 BuildMI(MBB, MI, DL, get(Opc), DestReg)
832 .addReg(SrcReg, getKillRegState(KillSrc));
833 return;
834 }
835
836 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
837 RC == &AMDGPU::SReg_32RegClass) {
838 if (SrcReg == AMDGPU::SCC) {
839 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
840 .addImm(1)
841 .addImm(0);
842 return;
843 }
844
845 if (DestReg == AMDGPU::VCC_LO) {
846 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
847 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
848 .addReg(SrcReg, getKillRegState(KillSrc));
849 } else {
850 // FIXME: Hack until VReg_1 removed.
851 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
852 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
853 .addImm(0)
854 .addReg(SrcReg, getKillRegState(KillSrc));
855 }
856
857 return;
858 }
859
860 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
861 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
862 return;
863 }
864
865 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
866 .addReg(SrcReg, getKillRegState(KillSrc));
867 return;
868 }
869
870 if (RC == &AMDGPU::SReg_64RegClass) {
871 if (SrcReg == AMDGPU::SCC) {
872 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
873 .addImm(1)
874 .addImm(0);
875 return;
876 }
877
878 if (DestReg == AMDGPU::VCC) {
879 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
880 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
881 .addReg(SrcReg, getKillRegState(KillSrc));
882 } else {
883 // FIXME: Hack until VReg_1 removed.
884 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
885 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
886 .addImm(0)
887 .addReg(SrcReg, getKillRegState(KillSrc));
888 }
889
890 return;
891 }
892
893 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
894 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
895 return;
896 }
897
898 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
899 .addReg(SrcReg, getKillRegState(KillSrc));
900 return;
901 }
902
903 if (DestReg == AMDGPU::SCC) {
904 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
905 // but SelectionDAG emits such copies for i1 sources.
906 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
907 // This copy can only be produced by patterns
908 // with explicit SCC, which are known to be enabled
909 // only for subtargets with S_CMP_LG_U64 present.
910 assert(ST.hasScalarCompareEq64());
911 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
912 .addReg(SrcReg, getKillRegState(KillSrc))
913 .addImm(0);
914 } else {
915 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
916 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
917 .addReg(SrcReg, getKillRegState(KillSrc))
918 .addImm(0);
919 }
920
921 return;
922 }
923
924 if (RC == &AMDGPU::AGPR_32RegClass) {
925 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
926 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
927 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
928 .addReg(SrcReg, getKillRegState(KillSrc));
929 return;
930 }
931
932 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
933 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
934 .addReg(SrcReg, getKillRegState(KillSrc));
935 return;
936 }
937
938 // FIXME: The pass should maintain a scavenger to avoid scanning through the
939 // block on every AGPR spill.
940 RegScavenger RS;
941 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
942 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
943 return;
944 }
945
946 if (Size == 16) {
947 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
948 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
949 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
950
951 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
952 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
953 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
954 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
955 bool DstLow = !AMDGPU::isHi(DestReg, RI);
956 bool SrcLow = !AMDGPU::isHi(SrcReg, RI);
957 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
958 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
959
960 if (IsSGPRDst) {
961 if (!IsSGPRSrc) {
962 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
963 return;
964 }
965
966 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
967 .addReg(NewSrcReg, getKillRegState(KillSrc));
968 return;
969 }
970
971 if (IsAGPRDst || IsAGPRSrc) {
972 if (!DstLow || !SrcLow) {
973 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
974 "Cannot use hi16 subreg with an AGPR!");
975 }
976
977 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
978 return;
979 }
980
981 if (ST.hasTrue16BitInsts()) {
982 if (IsSGPRSrc) {
983 assert(SrcLow);
984 SrcReg = NewSrcReg;
985 }
986 // Use the smaller instruction encoding if possible.
987 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
988 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
989 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
990 .addReg(SrcReg);
991 } else {
992 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
993 .addImm(0) // src0_modifiers
994 .addReg(SrcReg)
995 .addImm(0); // op_sel
996 }
997 return;
998 }
999
1000 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1001 if (!DstLow || !SrcLow) {
1002 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1003 "Cannot use hi16 subreg on VI!");
1004 }
1005
1006 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1007 .addReg(NewSrcReg, getKillRegState(KillSrc));
1008 return;
1009 }
1010
1011 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1012 .addImm(0) // src0_modifiers
1013 .addReg(NewSrcReg)
1014 .addImm(0) // clamp
1021 // First implicit operand is $exec.
1022 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1023 return;
1024 }
1025
1026 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1027 if (ST.hasMovB64()) {
1028 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1029 .addReg(SrcReg, getKillRegState(KillSrc));
1030 return;
1031 }
1032 if (ST.hasPkMovB32()) {
1033 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1034 .addImm(SISrcMods::OP_SEL_1)
1035 .addReg(SrcReg)
1036 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1037 .addReg(SrcReg)
1038 .addImm(0) // op_sel_lo
1039 .addImm(0) // op_sel_hi
1040 .addImm(0) // neg_lo
1041 .addImm(0) // neg_hi
1042 .addImm(0) // clamp
1043 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1044 return;
1045 }
1046 }
1047
1048 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1049 if (RI.isSGPRClass(RC)) {
1050 if (!RI.isSGPRClass(SrcRC)) {
1051 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1052 return;
1053 }
1054 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1055 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1056 Forward);
1057 return;
1058 }
1059
1060 unsigned EltSize = 4;
1061 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1062 if (RI.isAGPRClass(RC)) {
1063 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1064 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1065 else if (RI.hasVGPRs(SrcRC) ||
1066 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1067 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1068 else
1069 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1070 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1071 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1072 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1073 (RI.isProperlyAlignedRC(*RC) &&
1074 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1075 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1076 if (ST.hasMovB64()) {
1077 Opcode = AMDGPU::V_MOV_B64_e32;
1078 EltSize = 8;
1079 } else if (ST.hasPkMovB32()) {
1080 Opcode = AMDGPU::V_PK_MOV_B32;
1081 EltSize = 8;
1082 }
1083 }
1084
1085 // For the cases where we need an intermediate instruction/temporary register
1086 // (destination is an AGPR), we need a scavenger.
1087 //
1088 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1089 // whole block for every handled copy.
1090 std::unique_ptr<RegScavenger> RS;
1091 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1092 RS = std::make_unique<RegScavenger>();
1093
1094 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1095
1096 // If there is an overlap, we can't kill the super-register on the last
1097 // instruction, since it will also kill the components made live by this def.
1098 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1099 const bool CanKillSuperReg = KillSrc && !Overlap;
1100
1101 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1102 unsigned SubIdx;
1103 if (Forward)
1104 SubIdx = SubIndices[Idx];
1105 else
1106 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1107 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1108 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1109 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1110
1111 bool IsFirstSubreg = Idx == 0;
1112 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1113
1114 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1115 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1116 Register ImpUseSuper = SrcReg;
1117 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1118 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1119 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1120 MachineInstrBuilder MIB =
1121 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1122 .addImm(SISrcMods::OP_SEL_1)
1123 .addReg(SrcSubReg)
1124 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1125 .addReg(SrcSubReg)
1126 .addImm(0) // op_sel_lo
1127 .addImm(0) // op_sel_hi
1128 .addImm(0) // neg_lo
1129 .addImm(0) // neg_hi
1130 .addImm(0) // clamp
1131 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1132 if (IsFirstSubreg)
1133 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1134 } else {
1135 MachineInstrBuilder Builder =
1136 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1137 if (IsFirstSubreg)
1138 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1139
1140 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1141 }
1142 }
1143}
1144
1145int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1146 int NewOpc;
1147
1148 // Try to map original to commuted opcode
1149 NewOpc = AMDGPU::getCommuteRev(Opcode);
1150 if (NewOpc != -1)
1151 // Check if the commuted (REV) opcode exists on the target.
1152 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1153
1154 // Try to map commuted to original opcode
1155 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1156 if (NewOpc != -1)
1157 // Check if the original (non-REV) opcode exists on the target.
1158 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1159
1160 return Opcode;
1161}
1162
1163void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
1164 MachineBasicBlock::iterator MI,
1165 const DebugLoc &DL, Register DestReg,
1166 int64_t Value) const {
1167 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1168 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1169 if (RegClass == &AMDGPU::SReg_32RegClass ||
1170 RegClass == &AMDGPU::SGPR_32RegClass ||
1171 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1172 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1173 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1174 .addImm(Value);
1175 return;
1176 }
1177
1178 if (RegClass == &AMDGPU::SReg_64RegClass ||
1179 RegClass == &AMDGPU::SGPR_64RegClass ||
1180 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1181 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1182 .addImm(Value);
1183 return;
1184 }
1185
1186 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1187 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1188 .addImm(Value);
1189 return;
1190 }
1191 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1192 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1193 .addImm(Value);
1194 return;
1195 }
1196
1197 unsigned EltSize = 4;
1198 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1199 if (RI.isSGPRClass(RegClass)) {
1200 if (RI.getRegSizeInBits(*RegClass) > 32) {
1201 Opcode = AMDGPU::S_MOV_B64;
1202 EltSize = 8;
1203 } else {
1204 Opcode = AMDGPU::S_MOV_B32;
1205 EltSize = 4;
1206 }
1207 }
1208
1209 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
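// Only the first sub-register receives Value; the remaining sub-registers are
// written with 0, so the immediate is effectively zero-extended into the wide
// register.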
1210 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1211 int64_t IdxValue = Idx == 0 ? Value : 0;
1212
1213 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1214 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1215 Builder.addImm(IdxValue);
1216 }
1217}
1218
1219const TargetRegisterClass *
1220SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1221 return &AMDGPU::VGPR_32RegClass;
1222}
1223
1224void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1225 MachineBasicBlock::iterator I,
1226 const DebugLoc &DL, Register DstReg,
1227 ArrayRef<MachineOperand> Cond,
1228 Register TrueReg,
1229 Register FalseReg) const {
1230 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1231 const TargetRegisterClass *BoolXExecRC =
1232 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1233 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1234 "Not a VGPR32 reg");
1235
1236 if (Cond.size() == 1) {
1237 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1238 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1239 .add(Cond[0]);
1240 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1241 .addImm(0)
1242 .addReg(FalseReg)
1243 .addImm(0)
1244 .addReg(TrueReg)
1245 .addReg(SReg);
1246 } else if (Cond.size() == 2) {
1247 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1248 switch (Cond[0].getImm()) {
1249 case SIInstrInfo::SCC_TRUE: {
1250 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1251 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1252 : AMDGPU::S_CSELECT_B64), SReg)
1253 .addImm(1)
1254 .addImm(0);
1255 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1256 .addImm(0)
1257 .addReg(FalseReg)
1258 .addImm(0)
1259 .addReg(TrueReg)
1260 .addReg(SReg);
1261 break;
1262 }
1263 case SIInstrInfo::SCC_FALSE: {
1264 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1265 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1266 : AMDGPU::S_CSELECT_B64), SReg)
1267 .addImm(0)
1268 .addImm(1);
1269 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1270 .addImm(0)
1271 .addReg(FalseReg)
1272 .addImm(0)
1273 .addReg(TrueReg)
1274 .addReg(SReg);
1275 break;
1276 }
1277 case SIInstrInfo::VCCNZ: {
1278 MachineOperand RegOp = Cond[1];
1279 RegOp.setImplicit(false);
1280 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1281 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1282 .add(RegOp);
1283 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1284 .addImm(0)
1285 .addReg(FalseReg)
1286 .addImm(0)
1287 .addReg(TrueReg)
1288 .addReg(SReg);
1289 break;
1290 }
1291 case SIInstrInfo::VCCZ: {
1292 MachineOperand RegOp = Cond[1];
1293 RegOp.setImplicit(false);
1294 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1295 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1296 .add(RegOp);
1297 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1298 .addImm(0)
1299 .addReg(TrueReg)
1300 .addImm(0)
1301 .addReg(FalseReg)
1302 .addReg(SReg);
1303 break;
1304 }
1305 case SIInstrInfo::EXECNZ: {
1306 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1307 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1308 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1309 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1310 .addImm(0);
1311 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1312 : AMDGPU::S_CSELECT_B64), SReg)
1313 .addImm(1)
1314 .addImm(0);
1315 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1316 .addImm(0)
1317 .addReg(FalseReg)
1318 .addImm(0)
1319 .addReg(TrueReg)
1320 .addReg(SReg);
1321 break;
1322 }
1323 case SIInstrInfo::EXECZ: {
1324 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1325 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1326 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1327 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1328 .addImm(0);
1329 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1330 : AMDGPU::S_CSELECT_B64), SReg)
1331 .addImm(0)
1332 .addImm(1);
1333 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1334 .addImm(0)
1335 .addReg(FalseReg)
1336 .addImm(0)
1337 .addReg(TrueReg)
1338 .addReg(SReg);
1339 llvm_unreachable("Unhandled branch predicate EXECZ");
1340 break;
1341 }
1342 default:
1343 llvm_unreachable("invalid branch predicate");
1344 }
1345 } else {
1346 llvm_unreachable("Can only handle Cond size 1 or 2");
1347 }
1348}
1349
1350Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1351 MachineBasicBlock::iterator I,
1352 const DebugLoc &DL,
1353 Register SrcReg, int Value) const {
1354 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1355 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1356 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1357 .addImm(Value)
1358 .addReg(SrcReg);
1359
1360 return Reg;
1361}
1362
1363Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1364 MachineBasicBlock::iterator I,
1365 const DebugLoc &DL,
1366 Register SrcReg, int Value) const {
1367 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1368 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1369 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1370 .addImm(Value)
1371 .addReg(SrcReg);
1372
1373 return Reg;
1374}
1375
1376unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1377
1378 if (RI.isAGPRClass(DstRC))
1379 return AMDGPU::COPY;
1380 if (RI.getRegSizeInBits(*DstRC) == 16) {
1381 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1382 // before RA.
1383 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1384 }
1385 if (RI.getRegSizeInBits(*DstRC) == 32)
1386 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1387 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1388 return AMDGPU::S_MOV_B64;
1389 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1390 return AMDGPU::V_MOV_B64_PSEUDO;
1391 return AMDGPU::COPY;
1392}
1393
1394const MCInstrDesc &
1395SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1396 bool IsIndirectSrc) const {
1397 if (IsIndirectSrc) {
1398 if (VecSize <= 32) // 4 bytes
1399 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1400 if (VecSize <= 64) // 8 bytes
1401 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1402 if (VecSize <= 96) // 12 bytes
1403 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1404 if (VecSize <= 128) // 16 bytes
1405 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1406 if (VecSize <= 160) // 20 bytes
1407 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1408 if (VecSize <= 256) // 32 bytes
1409 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1410 if (VecSize <= 288) // 36 bytes
1411 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1412 if (VecSize <= 320) // 40 bytes
1413 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1414 if (VecSize <= 352) // 44 bytes
1415 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1416 if (VecSize <= 384) // 48 bytes
1417 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1418 if (VecSize <= 512) // 64 bytes
1419 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1420 if (VecSize <= 1024) // 128 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1422
1423 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1424 }
1425
1426 if (VecSize <= 32) // 4 bytes
1427 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1428 if (VecSize <= 64) // 8 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1430 if (VecSize <= 96) // 12 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1432 if (VecSize <= 128) // 16 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1434 if (VecSize <= 160) // 20 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1436 if (VecSize <= 256) // 32 bytes
1437 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1438 if (VecSize <= 288) // 36 bytes
1439 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1440 if (VecSize <= 320) // 40 bytes
1441 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1442 if (VecSize <= 352) // 44 bytes
1443 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1444 if (VecSize <= 384) // 48 bytes
1445 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1446 if (VecSize <= 512) // 64 bytes
1447 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1448 if (VecSize <= 1024) // 128 bytes
1449 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1450
1451 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1452}
1453
1454static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1455 if (VecSize <= 32) // 4 bytes
1456 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1457 if (VecSize <= 64) // 8 bytes
1458 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1459 if (VecSize <= 96) // 12 bytes
1460 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1461 if (VecSize <= 128) // 16 bytes
1462 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1463 if (VecSize <= 160) // 20 bytes
1464 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1465 if (VecSize <= 256) // 32 bytes
1466 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1467 if (VecSize <= 288) // 36 bytes
1468 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1469 if (VecSize <= 320) // 40 bytes
1470 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1471 if (VecSize <= 352) // 44 bytes
1472 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1473 if (VecSize <= 384) // 48 bytes
1474 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1475 if (VecSize <= 512) // 64 bytes
1476 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1477 if (VecSize <= 1024) // 128 bytes
1478 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1479
1480 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1481}
1482
1483static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1484 if (VecSize <= 32) // 4 bytes
1485 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1486 if (VecSize <= 64) // 8 bytes
1487 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1488 if (VecSize <= 96) // 12 bytes
1489 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1490 if (VecSize <= 128) // 16 bytes
1491 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1492 if (VecSize <= 160) // 20 bytes
1493 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1494 if (VecSize <= 256) // 32 bytes
1495 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1496 if (VecSize <= 288) // 36 bytes
1497 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1498 if (VecSize <= 320) // 40 bytes
1499 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1500 if (VecSize <= 352) // 44 bytes
1501 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1502 if (VecSize <= 384) // 48 bytes
1503 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1504 if (VecSize <= 512) // 64 bytes
1505 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1506 if (VecSize <= 1024) // 128 bytes
1507 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1508
1509 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1510}
1511
1512static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1513 if (VecSize <= 64) // 8 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1515 if (VecSize <= 128) // 16 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1517 if (VecSize <= 256) // 32 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1519 if (VecSize <= 512) // 64 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1521 if (VecSize <= 1024) // 128 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1523
1524 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1525}
1526
1527const MCInstrDesc &
1528SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1529 bool IsSGPR) const {
1530 if (IsSGPR) {
1531 switch (EltSize) {
1532 case 32:
1533 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1534 case 64:
1535 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1536 default:
1537 llvm_unreachable("invalid reg indexing elt size");
1538 }
1539 }
1540
1541 assert(EltSize == 32 && "invalid reg indexing elt size");
1542 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1543}
1544
1545static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1546 switch (Size) {
1547 case 4:
1548 return AMDGPU::SI_SPILL_S32_SAVE;
1549 case 8:
1550 return AMDGPU::SI_SPILL_S64_SAVE;
1551 case 12:
1552 return AMDGPU::SI_SPILL_S96_SAVE;
1553 case 16:
1554 return AMDGPU::SI_SPILL_S128_SAVE;
1555 case 20:
1556 return AMDGPU::SI_SPILL_S160_SAVE;
1557 case 24:
1558 return AMDGPU::SI_SPILL_S192_SAVE;
1559 case 28:
1560 return AMDGPU::SI_SPILL_S224_SAVE;
1561 case 32:
1562 return AMDGPU::SI_SPILL_S256_SAVE;
1563 case 36:
1564 return AMDGPU::SI_SPILL_S288_SAVE;
1565 case 40:
1566 return AMDGPU::SI_SPILL_S320_SAVE;
1567 case 44:
1568 return AMDGPU::SI_SPILL_S352_SAVE;
1569 case 48:
1570 return AMDGPU::SI_SPILL_S384_SAVE;
1571 case 64:
1572 return AMDGPU::SI_SPILL_S512_SAVE;
1573 case 128:
1574 return AMDGPU::SI_SPILL_S1024_SAVE;
1575 default:
1576 llvm_unreachable("unknown register size");
1577 }
1578}
1579
1580static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1581 switch (Size) {
1582 case 4:
1583 return AMDGPU::SI_SPILL_V32_SAVE;
1584 case 8:
1585 return AMDGPU::SI_SPILL_V64_SAVE;
1586 case 12:
1587 return AMDGPU::SI_SPILL_V96_SAVE;
1588 case 16:
1589 return AMDGPU::SI_SPILL_V128_SAVE;
1590 case 20:
1591 return AMDGPU::SI_SPILL_V160_SAVE;
1592 case 24:
1593 return AMDGPU::SI_SPILL_V192_SAVE;
1594 case 28:
1595 return AMDGPU::SI_SPILL_V224_SAVE;
1596 case 32:
1597 return AMDGPU::SI_SPILL_V256_SAVE;
1598 case 36:
1599 return AMDGPU::SI_SPILL_V288_SAVE;
1600 case 40:
1601 return AMDGPU::SI_SPILL_V320_SAVE;
1602 case 44:
1603 return AMDGPU::SI_SPILL_V352_SAVE;
1604 case 48:
1605 return AMDGPU::SI_SPILL_V384_SAVE;
1606 case 64:
1607 return AMDGPU::SI_SPILL_V512_SAVE;
1608 case 128:
1609 return AMDGPU::SI_SPILL_V1024_SAVE;
1610 default:
1611 llvm_unreachable("unknown register size");
1612 }
1613}
1614
1615static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1616 switch (Size) {
1617 case 4:
1618 return AMDGPU::SI_SPILL_A32_SAVE;
1619 case 8:
1620 return AMDGPU::SI_SPILL_A64_SAVE;
1621 case 12:
1622 return AMDGPU::SI_SPILL_A96_SAVE;
1623 case 16:
1624 return AMDGPU::SI_SPILL_A128_SAVE;
1625 case 20:
1626 return AMDGPU::SI_SPILL_A160_SAVE;
1627 case 24:
1628 return AMDGPU::SI_SPILL_A192_SAVE;
1629 case 28:
1630 return AMDGPU::SI_SPILL_A224_SAVE;
1631 case 32:
1632 return AMDGPU::SI_SPILL_A256_SAVE;
1633 case 36:
1634 return AMDGPU::SI_SPILL_A288_SAVE;
1635 case 40:
1636 return AMDGPU::SI_SPILL_A320_SAVE;
1637 case 44:
1638 return AMDGPU::SI_SPILL_A352_SAVE;
1639 case 48:
1640 return AMDGPU::SI_SPILL_A384_SAVE;
1641 case 64:
1642 return AMDGPU::SI_SPILL_A512_SAVE;
1643 case 128:
1644 return AMDGPU::SI_SPILL_A1024_SAVE;
1645 default:
1646 llvm_unreachable("unknown register size");
1647 }
1648}
1649
1650static unsigned getAVSpillSaveOpcode(unsigned Size) {
1651 switch (Size) {
1652 case 4:
1653 return AMDGPU::SI_SPILL_AV32_SAVE;
1654 case 8:
1655 return AMDGPU::SI_SPILL_AV64_SAVE;
1656 case 12:
1657 return AMDGPU::SI_SPILL_AV96_SAVE;
1658 case 16:
1659 return AMDGPU::SI_SPILL_AV128_SAVE;
1660 case 20:
1661 return AMDGPU::SI_SPILL_AV160_SAVE;
1662 case 24:
1663 return AMDGPU::SI_SPILL_AV192_SAVE;
1664 case 28:
1665 return AMDGPU::SI_SPILL_AV224_SAVE;
1666 case 32:
1667 return AMDGPU::SI_SPILL_AV256_SAVE;
1668 case 36:
1669 return AMDGPU::SI_SPILL_AV288_SAVE;
1670 case 40:
1671 return AMDGPU::SI_SPILL_AV320_SAVE;
1672 case 44:
1673 return AMDGPU::SI_SPILL_AV352_SAVE;
1674 case 48:
1675 return AMDGPU::SI_SPILL_AV384_SAVE;
1676 case 64:
1677 return AMDGPU::SI_SPILL_AV512_SAVE;
1678 case 128:
1679 return AMDGPU::SI_SPILL_AV1024_SAVE;
1680 default:
1681 llvm_unreachable("unknown register size");
1682 }
1683}
1684
1685static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1686 bool IsVectorSuperClass) {
1687 // Currently, only 32-bit WWM register spills are needed.
1688 if (Size != 4)
1689 llvm_unreachable("unknown wwm register spill size");
1690
1691 if (IsVectorSuperClass)
1692 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1693
1694 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1695}
1696
1697static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1698 const TargetRegisterClass *RC,
1699 unsigned Size,
1700 const SIRegisterInfo &TRI,
1701 const SIMachineFunctionInfo &MFI) {
1702 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1703
1704 // Choose the right opcode if spilling a WWM register.
1705 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1706 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1707
1708 if (IsVectorSuperClass)
1709 return getAVSpillSaveOpcode(Size);
1710
1711 return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1712 : getVGPRSpillSaveOpcode(Size);
1713}
1714
1715void SIInstrInfo::storeRegToStackSlot(
1716 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1717 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1718 const TargetRegisterInfo *TRI, Register VReg) const {
1719 MachineFunction *MF = MBB.getParent();
1720 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1721 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1722 const DebugLoc &DL = MBB.findDebugLoc(MI);
1723
1724 MachinePointerInfo PtrInfo
1725 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1726 MachineMemOperand *MMO = MF->getMachineMemOperand(
1727 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1728 FrameInfo.getObjectAlign(FrameIndex));
1729 unsigned SpillSize = TRI->getSpillSize(*RC);
1730
1731 MachineRegisterInfo &MRI = MF->getRegInfo();
1732 if (RI.isSGPRClass(RC)) {
1733 MFI->setHasSpilledSGPRs();
1734 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1735 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1736 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1737
1738 // We are only allowed to create one new instruction when spilling
1739 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1740 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1741
1742 // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1743 // need to make sure we are using the correct register class.
1744 if (SrcReg.isVirtual() && SpillSize == 4) {
1745 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1746 }
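  // For example (illustrative; not the exact operand list), a 4-byte SGPR
  // spill becomes the pseudo
  //   SI_SPILL_S32_SAVE killed $sgpr4, %stack.0
  // and, when spillSGPRToVGPR() is enabled, is later lowered to a
  // v_writelane_b32 into a lane of a reserved VGPR instead of a real store.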
1747
1748 BuildMI(MBB, MI, DL, OpDesc)
1749 .addReg(SrcReg, getKillRegState(isKill)) // data
1750 .addFrameIndex(FrameIndex) // addr
1751 .addMemOperand(MMO)
1753 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1754 if (RI.spillSGPRToVGPR())
1755 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1756 return;
1757 }
1758
1759 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1760 SpillSize, RI, *MFI);
1761 MFI->setHasSpilledVGPRs();
1762
1763 BuildMI(MBB, MI, DL, get(Opcode))
1764 .addReg(SrcReg, getKillRegState(isKill)) // data
1765 .addFrameIndex(FrameIndex) // addr
1766 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1767 .addImm(0) // offset
1768 .addMemOperand(MMO);
1769}
1770
1771static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1772 switch (Size) {
1773 case 4:
1774 return AMDGPU::SI_SPILL_S32_RESTORE;
1775 case 8:
1776 return AMDGPU::SI_SPILL_S64_RESTORE;
1777 case 12:
1778 return AMDGPU::SI_SPILL_S96_RESTORE;
1779 case 16:
1780 return AMDGPU::SI_SPILL_S128_RESTORE;
1781 case 20:
1782 return AMDGPU::SI_SPILL_S160_RESTORE;
1783 case 24:
1784 return AMDGPU::SI_SPILL_S192_RESTORE;
1785 case 28:
1786 return AMDGPU::SI_SPILL_S224_RESTORE;
1787 case 32:
1788 return AMDGPU::SI_SPILL_S256_RESTORE;
1789 case 36:
1790 return AMDGPU::SI_SPILL_S288_RESTORE;
1791 case 40:
1792 return AMDGPU::SI_SPILL_S320_RESTORE;
1793 case 44:
1794 return AMDGPU::SI_SPILL_S352_RESTORE;
1795 case 48:
1796 return AMDGPU::SI_SPILL_S384_RESTORE;
1797 case 64:
1798 return AMDGPU::SI_SPILL_S512_RESTORE;
1799 case 128:
1800 return AMDGPU::SI_SPILL_S1024_RESTORE;
1801 default:
1802 llvm_unreachable("unknown register size");
1803 }
1804}
1805
1806static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1807 switch (Size) {
1808 case 4:
1809 return AMDGPU::SI_SPILL_V32_RESTORE;
1810 case 8:
1811 return AMDGPU::SI_SPILL_V64_RESTORE;
1812 case 12:
1813 return AMDGPU::SI_SPILL_V96_RESTORE;
1814 case 16:
1815 return AMDGPU::SI_SPILL_V128_RESTORE;
1816 case 20:
1817 return AMDGPU::SI_SPILL_V160_RESTORE;
1818 case 24:
1819 return AMDGPU::SI_SPILL_V192_RESTORE;
1820 case 28:
1821 return AMDGPU::SI_SPILL_V224_RESTORE;
1822 case 32:
1823 return AMDGPU::SI_SPILL_V256_RESTORE;
1824 case 36:
1825 return AMDGPU::SI_SPILL_V288_RESTORE;
1826 case 40:
1827 return AMDGPU::SI_SPILL_V320_RESTORE;
1828 case 44:
1829 return AMDGPU::SI_SPILL_V352_RESTORE;
1830 case 48:
1831 return AMDGPU::SI_SPILL_V384_RESTORE;
1832 case 64:
1833 return AMDGPU::SI_SPILL_V512_RESTORE;
1834 case 128:
1835 return AMDGPU::SI_SPILL_V1024_RESTORE;
1836 default:
1837 llvm_unreachable("unknown register size");
1838 }
1839}
1840
1841static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1842 switch (Size) {
1843 case 4:
1844 return AMDGPU::SI_SPILL_A32_RESTORE;
1845 case 8:
1846 return AMDGPU::SI_SPILL_A64_RESTORE;
1847 case 12:
1848 return AMDGPU::SI_SPILL_A96_RESTORE;
1849 case 16:
1850 return AMDGPU::SI_SPILL_A128_RESTORE;
1851 case 20:
1852 return AMDGPU::SI_SPILL_A160_RESTORE;
1853 case 24:
1854 return AMDGPU::SI_SPILL_A192_RESTORE;
1855 case 28:
1856 return AMDGPU::SI_SPILL_A224_RESTORE;
1857 case 32:
1858 return AMDGPU::SI_SPILL_A256_RESTORE;
1859 case 36:
1860 return AMDGPU::SI_SPILL_A288_RESTORE;
1861 case 40:
1862 return AMDGPU::SI_SPILL_A320_RESTORE;
1863 case 44:
1864 return AMDGPU::SI_SPILL_A352_RESTORE;
1865 case 48:
1866 return AMDGPU::SI_SPILL_A384_RESTORE;
1867 case 64:
1868 return AMDGPU::SI_SPILL_A512_RESTORE;
1869 case 128:
1870 return AMDGPU::SI_SPILL_A1024_RESTORE;
1871 default:
1872 llvm_unreachable("unknown register size");
1873 }
1874}
1875
1876static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1877 switch (Size) {
1878 case 4:
1879 return AMDGPU::SI_SPILL_AV32_RESTORE;
1880 case 8:
1881 return AMDGPU::SI_SPILL_AV64_RESTORE;
1882 case 12:
1883 return AMDGPU::SI_SPILL_AV96_RESTORE;
1884 case 16:
1885 return AMDGPU::SI_SPILL_AV128_RESTORE;
1886 case 20:
1887 return AMDGPU::SI_SPILL_AV160_RESTORE;
1888 case 24:
1889 return AMDGPU::SI_SPILL_AV192_RESTORE;
1890 case 28:
1891 return AMDGPU::SI_SPILL_AV224_RESTORE;
1892 case 32:
1893 return AMDGPU::SI_SPILL_AV256_RESTORE;
1894 case 36:
1895 return AMDGPU::SI_SPILL_AV288_RESTORE;
1896 case 40:
1897 return AMDGPU::SI_SPILL_AV320_RESTORE;
1898 case 44:
1899 return AMDGPU::SI_SPILL_AV352_RESTORE;
1900 case 48:
1901 return AMDGPU::SI_SPILL_AV384_RESTORE;
1902 case 64:
1903 return AMDGPU::SI_SPILL_AV512_RESTORE;
1904 case 128:
1905 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1906 default:
1907 llvm_unreachable("unknown register size");
1908 }
1909}
1910
1911static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1912 bool IsVectorSuperClass) {
1913 // Currently, only 32-bit WWM register spills are needed.
1914 if (Size != 4)
1915 llvm_unreachable("unknown wwm register spill size");
1916
1917 if (IsVectorSuperClass)
1918 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1919
1920 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1921}
1922
1923 static unsigned
1924 getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1925 unsigned Size, const SIRegisterInfo &TRI,
1926 const SIMachineFunctionInfo &MFI) {
1927 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1928
1929 // Choose the right opcode if restoring a WWM register.
1931 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1932
1933 if (IsVectorSuperClass)
1934 return getAVSpillRestoreOpcode(Size);
1935
1936 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1937 : getVGPRSpillRestoreOpcode(Size);
1938}
1939
1942 Register DestReg, int FrameIndex,
1943 const TargetRegisterClass *RC,
1944 const TargetRegisterInfo *TRI,
1945 Register VReg) const {
1948 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1949 const DebugLoc &DL = MBB.findDebugLoc(MI);
1950 unsigned SpillSize = TRI->getSpillSize(*RC);
1951
1952 MachinePointerInfo PtrInfo
1953 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1954
1956 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1957 FrameInfo.getObjectAlign(FrameIndex));
1958
1959 if (RI.isSGPRClass(RC)) {
1960 MFI->setHasSpilledSGPRs();
1961 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1962 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1963 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1964
1965 // FIXME: Maybe this should not include a memoperand because it will be
1966 // lowered to non-memory instructions.
1967 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1968 if (DestReg.isVirtual() && SpillSize == 4) {
1970 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1971 }
1972
1973 if (RI.spillSGPRToVGPR())
1974 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1975 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1976 .addFrameIndex(FrameIndex) // addr
1977 .addMemOperand(MMO)
1979 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1980 return;
1981 }
1982
1983 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1984 SpillSize, RI, *MFI);
1985 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1986 .addFrameIndex(FrameIndex) // vaddr
1987 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1988 .addImm(0) // offset
1989 .addMemOperand(MMO);
1990}
1991
1994 insertNoops(MBB, MI, 1);
1995}
1996
1999 unsigned Quantity) const {
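  // S_NOP's immediate encodes (wait states - 1), and a single S_NOP covers at
  // most 8 of them; e.g. Quantity == 10 emits "s_nop 7" followed by "s_nop 1".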
2001 while (Quantity > 0) {
2002 unsigned Arg = std::min(Quantity, 8u);
2003 Quantity -= Arg;
2004 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
2005 }
2006}
2007
2009 auto MF = MBB.getParent();
2011
2012 assert(Info->isEntryFunction());
2013
2014 if (MBB.succ_empty()) {
2015 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2016 if (HasNoTerminator) {
2017 if (Info->returnsVoid()) {
2018 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2019 } else {
2020 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2021 }
2022 }
2023 }
2024}
2025
2029 const DebugLoc &DL) const {
2031 constexpr unsigned DoorbellIDMask = 0x3ff;
2032 constexpr unsigned ECQueueWaveAbort = 0x400;
2033
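  // Rough shape of the expansion: the current block conditionally branches to
  // TrapBB, which raises the trap and signals the queue abort, then branches
  // to HaltLoopBB, an s_sethalt self-loop; ContBB (the split continuation of
  // MBB) resumes normal execution.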
2034 MachineBasicBlock *TrapBB = &MBB;
2035 MachineBasicBlock *ContBB = &MBB;
2036 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2037
2038 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2039 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2040 TrapBB = MF->CreateMachineBasicBlock();
2041 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2042 MF->push_back(TrapBB);
2043 MBB.addSuccessor(TrapBB);
2044 }
2045
2046 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
2047 // this will be a nop.
2048 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2049 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2050 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2051 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2052 DoorbellReg)
2054 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2055 .addUse(AMDGPU::M0);
2056 Register DoorbellRegMasked =
2057 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2058 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2059 .addUse(DoorbellReg)
2060 .addImm(DoorbellIDMask);
2061 Register SetWaveAbortBit =
2062 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2063 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2064 .addUse(DoorbellRegMasked)
2065 .addImm(ECQueueWaveAbort);
2066 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2067 .addUse(SetWaveAbortBit);
2068 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2070 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2071 .addUse(AMDGPU::TTMP2);
2072 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2073 TrapBB->addSuccessor(HaltLoopBB);
2074
2075 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2076 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2077 .addMBB(HaltLoopBB);
2078 MF->push_back(HaltLoopBB);
2079 HaltLoopBB->addSuccessor(HaltLoopBB);
2080
2081 return ContBB;
2082}
2083
2085 switch (MI.getOpcode()) {
2086 default:
2087 if (MI.isMetaInstruction())
2088 return 0;
2089 return 1; // FIXME: Do wait states equal cycles?
2090
2091 case AMDGPU::S_NOP:
2092 return MI.getOperand(0).getImm() + 1;
2093 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2094 // hazard, even if one exists, won't really be visible. Should we handle it?
2095 }
2096}
2097
2099 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2100 MachineBasicBlock &MBB = *MI.getParent();
2102 switch (MI.getOpcode()) {
2103 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2104 case AMDGPU::S_MOV_B64_term:
2105 // This is only a terminator to get the correct spill code placement during
2106 // register allocation.
2107 MI.setDesc(get(AMDGPU::S_MOV_B64));
2108 break;
2109
2110 case AMDGPU::S_MOV_B32_term:
2111 // This is only a terminator to get the correct spill code placement during
2112 // register allocation.
2113 MI.setDesc(get(AMDGPU::S_MOV_B32));
2114 break;
2115
2116 case AMDGPU::S_XOR_B64_term:
2117 // This is only a terminator to get the correct spill code placement during
2118 // register allocation.
2119 MI.setDesc(get(AMDGPU::S_XOR_B64));
2120 break;
2121
2122 case AMDGPU::S_XOR_B32_term:
2123 // This is only a terminator to get the correct spill code placement during
2124 // register allocation.
2125 MI.setDesc(get(AMDGPU::S_XOR_B32));
2126 break;
2127 case AMDGPU::S_OR_B64_term:
2128 // This is only a terminator to get the correct spill code placement during
2129 // register allocation.
2130 MI.setDesc(get(AMDGPU::S_OR_B64));
2131 break;
2132 case AMDGPU::S_OR_B32_term:
2133 // This is only a terminator to get the correct spill code placement during
2134 // register allocation.
2135 MI.setDesc(get(AMDGPU::S_OR_B32));
2136 break;
2137
2138 case AMDGPU::S_ANDN2_B64_term:
2139 // This is only a terminator to get the correct spill code placement during
2140 // register allocation.
2141 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2142 break;
2143
2144 case AMDGPU::S_ANDN2_B32_term:
2145 // This is only a terminator to get the correct spill code placement during
2146 // register allocation.
2147 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2148 break;
2149
2150 case AMDGPU::S_AND_B64_term:
2151 // This is only a terminator to get the correct spill code placement during
2152 // register allocation.
2153 MI.setDesc(get(AMDGPU::S_AND_B64));
2154 break;
2155
2156 case AMDGPU::S_AND_B32_term:
2157 // This is only a terminator to get the correct spill code placement during
2158 // register allocation.
2159 MI.setDesc(get(AMDGPU::S_AND_B32));
2160 break;
2161
2162 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2163 // This is only a terminator to get the correct spill code placement during
2164 // register allocation.
2165 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2166 break;
2167
2168 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2169 // This is only a terminator to get the correct spill code placement during
2170 // register allocation.
2171 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2172 break;
2173
2174 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2175 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2176 break;
2177
2178 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2179 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2180 break;
2181
2182 case AMDGPU::V_MOV_B64_PSEUDO: {
2183 Register Dst = MI.getOperand(0).getReg();
2184 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2185 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2186
2187 const MachineOperand &SrcOp = MI.getOperand(1);
2188 // FIXME: Will this work for 64-bit floating point immediates?
2189 assert(!SrcOp.isFPImm());
2190 if (ST.hasMovB64()) {
2191 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2192 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2193 isUInt<32>(SrcOp.getImm()))
2194 break;
2195 }
2196 if (SrcOp.isImm()) {
2197 APInt Imm(64, SrcOp.getImm());
2198 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2199 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2200 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2201 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2203 .addImm(Lo.getSExtValue())
2205 .addImm(Lo.getSExtValue())
2206 .addImm(0) // op_sel_lo
2207 .addImm(0) // op_sel_hi
2208 .addImm(0) // neg_lo
2209 .addImm(0) // neg_hi
2210 .addImm(0); // clamp
2211 } else {
2212 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2213 .addImm(Lo.getSExtValue())
2215 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2216 .addImm(Hi.getSExtValue())
2218 }
2219 } else {
2220 assert(SrcOp.isReg());
2221 if (ST.hasPkMovB32() &&
2222 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2223 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2224 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2225 .addReg(SrcOp.getReg())
2227 .addReg(SrcOp.getReg())
2228 .addImm(0) // op_sel_lo
2229 .addImm(0) // op_sel_hi
2230 .addImm(0) // neg_lo
2231 .addImm(0) // neg_hi
2232 .addImm(0); // clamp
2233 } else {
2234 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2235 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2237 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2238 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2240 }
2241 }
2242 MI.eraseFromParent();
2243 break;
2244 }
2245 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2246 expandMovDPP64(MI);
2247 break;
2248 }
2249 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2250 const MachineOperand &SrcOp = MI.getOperand(1);
2251 assert(!SrcOp.isFPImm());
2252 APInt Imm(64, SrcOp.getImm());
2253 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2254 MI.setDesc(get(AMDGPU::S_MOV_B64));
2255 break;
2256 }
2257
2258 Register Dst = MI.getOperand(0).getReg();
2259 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2260 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2261
2262 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2263 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2264 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2265 .addImm(Lo.getSExtValue())
2267 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2268 .addImm(Hi.getSExtValue())
2270 MI.eraseFromParent();
2271 break;
2272 }
2273 case AMDGPU::V_SET_INACTIVE_B32: {
2274 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2275 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2276 // FIXME: We may be able to optimize the COPY once we find ways to make LLVM
2277 // optimizations (mainly the Register Coalescer) aware of WWM register liveness.
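    // Expansion sketch (wave64 shown; wave32 uses the 32-bit exec/not):
    //   v_mov_b32 dst, src       ; write the active lanes
    //   s_not_b64 exec, exec
    //   v_mov_b32 dst, inactive  ; write the previously inactive lanes
    //   s_not_b64 exec, exec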
2278 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2279 .add(MI.getOperand(1));
2280 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2281 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2282 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2283 .add(MI.getOperand(2));
2284 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2285 .addReg(Exec);
2286 MI.eraseFromParent();
2287 break;
2288 }
2289 case AMDGPU::V_SET_INACTIVE_B64: {
2290 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2291 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2292 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2293 MI.getOperand(0).getReg())
2294 .add(MI.getOperand(1));
2295 expandPostRAPseudo(*Copy);
2296 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2297 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2298 Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2299 MI.getOperand(0).getReg())
2300 .add(MI.getOperand(2));
2301 expandPostRAPseudo(*Copy);
2302 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2303 .addReg(Exec);
2304 MI.eraseFromParent();
2305 break;
2306 }
2307 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2308 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2309 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2310 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2311 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2312 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2313 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2314 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2315 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2316 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2317 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2318 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2319 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2320 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2322 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2323 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2324 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2325 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2326 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2327 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2328 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2329 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2330 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2331 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2332 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2333 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2334 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2335 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2336 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2337
2338 unsigned Opc;
2339 if (RI.hasVGPRs(EltRC)) {
2340 Opc = AMDGPU::V_MOVRELD_B32_e32;
2341 } else {
2342 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2343 : AMDGPU::S_MOVRELD_B32;
2344 }
2345
2346 const MCInstrDesc &OpDesc = get(Opc);
2347 Register VecReg = MI.getOperand(0).getReg();
2348 bool IsUndef = MI.getOperand(1).isUndef();
2349 unsigned SubReg = MI.getOperand(3).getImm();
2350 assert(VecReg == MI.getOperand(1).getReg());
2351
2353 BuildMI(MBB, MI, DL, OpDesc)
2354 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2355 .add(MI.getOperand(2))
2357 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2358
2359 const int ImpDefIdx =
2360 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2361 const int ImpUseIdx = ImpDefIdx + 1;
2362 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2363 MI.eraseFromParent();
2364 break;
2365 }
2366 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2367 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2368 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2369 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2370 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2371 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2372 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2373 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2374 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2375 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2376 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2377 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2379 Register VecReg = MI.getOperand(0).getReg();
2380 bool IsUndef = MI.getOperand(1).isUndef();
2381 Register Idx = MI.getOperand(3).getReg();
2382 Register SubReg = MI.getOperand(4).getImm();
2383
2384 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2385 .addReg(Idx)
2387 SetOn->getOperand(3).setIsUndef();
2388
2389 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2391 BuildMI(MBB, MI, DL, OpDesc)
2392 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2393 .add(MI.getOperand(2))
2395 .addReg(VecReg,
2396 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2397
2398 const int ImpDefIdx =
2399 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2400 const int ImpUseIdx = ImpDefIdx + 1;
2401 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2402
2403 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2404
2405 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2406
2407 MI.eraseFromParent();
2408 break;
2409 }
2410 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2411 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2412 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2413 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2414 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2415 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2416 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2417 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2418 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2419 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2420 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2421 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2423 Register Dst = MI.getOperand(0).getReg();
2424 Register VecReg = MI.getOperand(1).getReg();
2425 bool IsUndef = MI.getOperand(1).isUndef();
2426 Register Idx = MI.getOperand(2).getReg();
2427 Register SubReg = MI.getOperand(3).getImm();
2428
2429 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2430 .addReg(Idx)
2432 SetOn->getOperand(3).setIsUndef();
2433
2434 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2435 .addDef(Dst)
2436 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2437 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2438
2439 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2440
2441 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2442
2443 MI.eraseFromParent();
2444 break;
2445 }
2446 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2447 MachineFunction &MF = *MBB.getParent();
2448 Register Reg = MI.getOperand(0).getReg();
2449 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2450 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2451 MachineOperand OpLo = MI.getOperand(1);
2452 MachineOperand OpHi = MI.getOperand(2);
2453
2454 // Create a bundle so these instructions won't be re-ordered by the
2455 // post-RA scheduler.
2456 MIBundleBuilder Bundler(MBB, MI);
2457 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2458
2459 // What we want here is an offset from the value returned by s_getpc (which
2460 // is the address of the s_add_u32 instruction) to the global variable, but
2461 // since the encoding of $symbol starts 4 bytes after the start of the
2462 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2463 // small. This requires us to add 4 to the global variable offset in order
2464 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2465 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2466 // instruction.
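    // Illustrative result (sketch; the s_sext_i32_i16 fixup below shifts both
    // adjustments by another 4 bytes when present):
    //   s_getpc_b64 s[0:1]          ; s[0:1] = address of the next instruction
    //   s_add_u32   s0, s0, sym+4   ; $symbol encoded 4 bytes past s_add_u32
    //   s_addc_u32  s1, s1, sym+12  ; $symbol encoded 12 bytes past s_add_u32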
2467
2468 int64_t Adjust = 0;
2469 if (ST.hasGetPCZeroExtension()) {
2470 // Fix up hardware that does not sign-extend the 48-bit PC value by
2471 // inserting: s_sext_i32_i16 reghi, reghi
2472 Bundler.append(
2473 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2474 Adjust += 4;
2475 }
2476
2477 if (OpLo.isGlobal())
2478 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2479 Bundler.append(
2480 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2481
2482 if (OpHi.isGlobal())
2483 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2484 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2485 .addReg(RegHi)
2486 .add(OpHi));
2487
2488 finalizeBundle(MBB, Bundler.begin());
2489
2490 MI.eraseFromParent();
2491 break;
2492 }
2493 case AMDGPU::ENTER_STRICT_WWM: {
2494 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2495 // Whole Wave Mode is entered.
2496 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2497 : AMDGPU::S_OR_SAVEEXEC_B64));
2498 break;
2499 }
2500 case AMDGPU::ENTER_STRICT_WQM: {
2501 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2502 // STRICT_WQM is entered.
2503 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2504 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2505 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2506 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2507 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2508
2509 MI.eraseFromParent();
2510 break;
2511 }
2512 case AMDGPU::EXIT_STRICT_WWM:
2513 case AMDGPU::EXIT_STRICT_WQM: {
2514 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2515 // WWM/STRICT_WQM is exited.
2516 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2517 break;
2518 }
2519 case AMDGPU::SI_RETURN: {
2520 const MachineFunction *MF = MBB.getParent();
2521 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2522 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2523 // Hiding the return address use with SI_RETURN may lead to extra kills in
2524 // the function and missing live-ins. We are fine in practice because callee
2525 // saved register handling ensures the register value is restored before
2526 // RET, but we need the undef flag here to appease the MachineVerifier
2527 // liveness checks.
2529 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2530 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2531
2532 MIB.copyImplicitOps(MI);
2533 MI.eraseFromParent();
2534 break;
2535 }
2536
2537 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2538 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2539 MI.setDesc(get(AMDGPU::S_MUL_U64));
2540 break;
2541
2542 case AMDGPU::S_GETPC_B64_pseudo:
2543 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2544 if (ST.hasGetPCZeroExtension()) {
2545 Register Dst = MI.getOperand(0).getReg();
2546 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2547 // Fix up hardware that does not sign-extend the 48-bit PC value by
2548 // inserting: s_sext_i32_i16 dsthi, dsthi
2549 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2550 DstHi)
2551 .addReg(DstHi);
2552 }
2553 break;
2554 }
2555 return true;
2556}
2557
2560 unsigned SubIdx, const MachineInstr &Orig,
2561 const TargetRegisterInfo &RI) const {
2562
2563 // Try shrinking the instruction to remat only the part needed for the
2564 // current context.
2565 // TODO: Handle more cases.
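  // For example, if the single user reads only a 256-bit subregister of an
  // S_LOAD_DWORDX16_IMM result, rematerialize an S_LOAD_DWORDX8_IMM of just
  // that part, adding the subregister's byte offset to the load's immediate
  // offset.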
2566 unsigned Opcode = Orig.getOpcode();
2567 switch (Opcode) {
2568 case AMDGPU::S_LOAD_DWORDX16_IMM:
2569 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2570 if (SubIdx != 0)
2571 break;
2572
2573 if (I == MBB.end())
2574 break;
2575
2576 if (I->isBundled())
2577 break;
2578
2579 // Look for a single use of the register that is also a subreg.
2580 Register RegToFind = Orig.getOperand(0).getReg();
2581 MachineOperand *UseMO = nullptr;
2582 for (auto &CandMO : I->operands()) {
2583 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2584 continue;
2585 if (UseMO) {
2586 UseMO = nullptr;
2587 break;
2588 }
2589 UseMO = &CandMO;
2590 }
2591 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2592 break;
2593
2594 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2595 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2596
2599 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2600
2601 unsigned NewOpcode = -1;
2602 if (SubregSize == 256)
2603 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2604 else if (SubregSize == 128)
2605 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2606 else
2607 break;
2608
2609 const MCInstrDesc &TID = get(NewOpcode);
2610 const TargetRegisterClass *NewRC =
2611 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2612 MRI.setRegClass(DestReg, NewRC);
2613
2614 UseMO->setReg(DestReg);
2615 UseMO->setSubReg(AMDGPU::NoSubRegister);
2616
2617 // Use a smaller load with the desired size, possibly with updated offset.
2618 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2619 MI->setDesc(TID);
2620 MI->getOperand(0).setReg(DestReg);
2621 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2622 if (Offset) {
2623 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2624 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2625 OffsetMO->setImm(FinalOffset);
2626 }
2628 for (const MachineMemOperand *MemOp : Orig.memoperands())
2629 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2630 SubregSize / 8));
2631 MI->setMemRefs(*MF, NewMMOs);
2632
2633 MBB.insert(I, MI);
2634 return;
2635 }
2636
2637 default:
2638 break;
2639 }
2640
2641 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2642}
2643
2644std::pair<MachineInstr*, MachineInstr*>
2646 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2647
2648 if (ST.hasMovB64() &&
2650 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2651 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2652 return std::pair(&MI, nullptr);
2653 }
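  // Otherwise split the 64-bit DPP move into two V_MOV_B32_dpp instructions on
  // sub0 and sub1 and, for a virtual destination, recombine the halves with a
  // REG_SEQUENCE.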
2654
2655 MachineBasicBlock &MBB = *MI.getParent();
2659 Register Dst = MI.getOperand(0).getReg();
2660 unsigned Part = 0;
2661 MachineInstr *Split[2];
2662
2663 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2664 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2665 if (Dst.isPhysical()) {
2666 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2667 } else {
2668 assert(MRI.isSSA());
2669 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2670 MovDPP.addDef(Tmp);
2671 }
2672
2673 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2674 const MachineOperand &SrcOp = MI.getOperand(I);
2675 assert(!SrcOp.isFPImm());
2676 if (SrcOp.isImm()) {
2677 APInt Imm(64, SrcOp.getImm());
2678 Imm.ashrInPlace(Part * 32);
2679 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2680 } else {
2681 assert(SrcOp.isReg());
2682 Register Src = SrcOp.getReg();
2683 if (Src.isPhysical())
2684 MovDPP.addReg(RI.getSubReg(Src, Sub));
2685 else
2686 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2687 }
2688 }
2689
2690 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2691 MovDPP.addImm(MO.getImm());
2692
2693 Split[Part] = MovDPP;
2694 ++Part;
2695 }
2696
2697 if (Dst.isVirtual())
2698 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2699 .addReg(Split[0]->getOperand(0).getReg())
2700 .addImm(AMDGPU::sub0)
2701 .addReg(Split[1]->getOperand(0).getReg())
2702 .addImm(AMDGPU::sub1);
2703
2704 MI.eraseFromParent();
2705 return std::pair(Split[0], Split[1]);
2706}
2707
2708std::optional<DestSourcePair>
2710 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2711 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2712
2713 return std::nullopt;
2714}
2715
2717 MachineOperand &Src0,
2718 unsigned Src0OpName,
2719 MachineOperand &Src1,
2720 unsigned Src1OpName) const {
2721 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2722 if (!Src0Mods)
2723 return false;
2724
2725 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2726 assert(Src1Mods &&
2727 "All commutable instructions have both src0 and src1 modifiers");
2728
2729 int Src0ModsVal = Src0Mods->getImm();
2730 int Src1ModsVal = Src1Mods->getImm();
2731
2732 Src1Mods->setImm(Src0ModsVal);
2733 Src0Mods->setImm(Src1ModsVal);
2734 return true;
2735}
2736
2738 MachineOperand &RegOp,
2739 MachineOperand &NonRegOp) {
2740 Register Reg = RegOp.getReg();
2741 unsigned SubReg = RegOp.getSubReg();
2742 bool IsKill = RegOp.isKill();
2743 bool IsDead = RegOp.isDead();
2744 bool IsUndef = RegOp.isUndef();
2745 bool IsDebug = RegOp.isDebug();
2746
2747 if (NonRegOp.isImm())
2748 RegOp.ChangeToImmediate(NonRegOp.getImm());
2749 else if (NonRegOp.isFI())
2750 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2751 else if (NonRegOp.isGlobal()) {
2752 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2753 NonRegOp.getTargetFlags());
2754 } else
2755 return nullptr;
2756
2757 // Make sure we don't reinterpret a subreg index in the target flags.
2758 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2759
2760 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2761 NonRegOp.setSubReg(SubReg);
2762
2763 return &MI;
2764}
2765
2767 unsigned Src0Idx,
2768 unsigned Src1Idx) const {
2769 assert(!NewMI && "this should never be used");
2770
2771 unsigned Opc = MI.getOpcode();
2772 int CommutedOpcode = commuteOpcode(Opc);
2773 if (CommutedOpcode == -1)
2774 return nullptr;
2775
2776 if (Src0Idx > Src1Idx)
2777 std::swap(Src0Idx, Src1Idx);
2778
2779 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2780 static_cast<int>(Src0Idx) &&
2781 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2782 static_cast<int>(Src1Idx) &&
2783 "inconsistency with findCommutedOpIndices");
2784
2785 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2786 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2787
2788 MachineInstr *CommutedMI = nullptr;
2789 if (Src0.isReg() && Src1.isReg()) {
2790 if (isOperandLegal(MI, Src1Idx, &Src0)) {
2791 // Be sure to copy the source modifiers to the right place.
2792 CommutedMI
2793 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2794 }
2795
2796 } else if (Src0.isReg() && !Src1.isReg()) {
2797 // src0 should always be able to support any operand type, so no need to
2798 // check operand legality.
2799 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2800 } else if (!Src0.isReg() && Src1.isReg()) {
2801 if (isOperandLegal(MI, Src1Idx, &Src0))
2802 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2803 } else {
2804 // FIXME: Found two non-register operands to commute. This does happen.
2805 return nullptr;
2806 }
2807
2808 if (CommutedMI) {
2809 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2810 Src1, AMDGPU::OpName::src1_modifiers);
2811
2812 CommutedMI->setDesc(get(CommutedOpcode));
2813 }
2814
2815 return CommutedMI;
2816}
2817
2818// This needs to be implemented because the source modifiers may be inserted
2819// between the true commutable operands, and the base
2820// TargetInstrInfo::commuteInstruction uses it.
2822 unsigned &SrcOpIdx0,
2823 unsigned &SrcOpIdx1) const {
2824 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2825}
2826
2828 unsigned &SrcOpIdx0,
2829 unsigned &SrcOpIdx1) const {
2830 if (!Desc.isCommutable())
2831 return false;
2832
2833 unsigned Opc = Desc.getOpcode();
2834 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2835 if (Src0Idx == -1)
2836 return false;
2837
2838 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2839 if (Src1Idx == -1)
2840 return false;
2841
2842 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2843}
2844
2846 int64_t BrOffset) const {
2847 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2848 // block is unanalyzable.
2849 assert(BranchOp != AMDGPU::S_SETPC_B64);
2850
2851 // Convert to dwords.
2852 BrOffset /= 4;
2853
2854 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2855 // from the next instruction.
2856 BrOffset -= 1;
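  // E.g. with 16 offset bits, a forward branch of 0x20000 bytes becomes dword
  // offset 0x8000 - 1 = 0x7fff, which still fits, while 0x20004 bytes would
  // give 0x8000 and be out of range.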
2857
2858 return isIntN(BranchOffsetBits, BrOffset);
2859}
2860
2863 return MI.getOperand(0).getMBB();
2864}
2865
2867 for (const MachineInstr &MI : MBB->terminators()) {
2868 if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
2869 MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2870 MI.getOpcode() == AMDGPU::SI_LOOP)
2871 return true;
2872 }
2873 return false;
2874}
2875
2877 MachineBasicBlock &DestBB,
2878 MachineBasicBlock &RestoreBB,
2879 const DebugLoc &DL, int64_t BrOffset,
2880 RegScavenger *RS) const {
2881 assert(RS && "RegScavenger required for long branching");
2882 assert(MBB.empty() &&
2883 "new block should be inserted for expanding unconditional branch");
2884 assert(MBB.pred_size() == 1);
2885 assert(RestoreBB.empty() &&
2886 "restore block should be inserted for restoring clobbered registers");
2887
2891
2892 // FIXME: Virtual register workaround for RegScavenger not working with empty
2893 // blocks.
2894 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2895
2896 auto I = MBB.end();
2897
2898 // We need to compute the offset relative to the instruction immediately after
2899 // s_getpc_b64. Insert the PC arithmetic code before the last terminator.
2900 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2901
2902 auto &MCCtx = MF->getContext();
2903 MCSymbol *PostGetPCLabel =
2904 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2905 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2906
2907 MCSymbol *OffsetLo =
2908 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2909 MCSymbol *OffsetHi =
2910 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2911 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2912 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2913 .addReg(PCReg, 0, AMDGPU::sub0)
2914 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2915 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2916 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2917 .addReg(PCReg, 0, AMDGPU::sub1)
2918 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2919
2920 // Insert the indirect branch after the other terminator.
2921 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2922 .addReg(PCReg);
2923
2924 // If a spill is needed for the pc register pair, we need to insert a spill
2925 // restore block right before the destination block, and insert a short branch
2926 // into the old destination block's fallthrough predecessor.
2927 // e.g.:
2928 //
2929 // s_cbranch_scc0 skip_long_branch:
2930 //
2931 // long_branch_bb:
2932 // spill s[8:9]
2933 // s_getpc_b64 s[8:9]
2934 // s_add_u32 s8, s8, restore_bb
2935 // s_addc_u32 s9, s9, 0
2936 // s_setpc_b64 s[8:9]
2937 //
2938 // skip_long_branch:
2939 // foo;
2940 //
2941 // .....
2942 //
2943 // dest_bb_fallthrough_predecessor:
2944 // bar;
2945 // s_branch dest_bb
2946 //
2947 // restore_bb:
2948 // restore s[8:9]
2949 // fallthrough dest_bb
2950 //
2951 // dest_bb:
2952 // buzz;
2953
2954 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2955 Register Scav;
2956
2957 // If we've previously reserved a register for long branches,
2958 // avoid running the scavenger and just use that register.
2959 if (LongBranchReservedReg) {
2960 RS->enterBasicBlock(MBB);
2961 Scav = LongBranchReservedReg;
2962 } else {
2964 Scav = RS->scavengeRegisterBackwards(
2965 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2966 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2967 }
2968 if (Scav) {
2969 RS->setRegUsed(Scav);
2970 MRI.replaceRegWith(PCReg, Scav);
2971 MRI.clearVirtRegs();
2972 } else {
2973 // As spilling an SGPR needs a VGPR, we reuse the temporary VGPR's slot for
2974 // the SGPR spill.
2975 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2976 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2977 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
2978 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
2979 MRI.clearVirtRegs();
2980 }
2981
2982 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
2983 // Now the distance between the two labels can be defined.
2985 MCSymbolRefExpr::create(DestLabel, MCCtx),
2986 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
2987 // Add offset assignments.
2988 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
2989 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
2990 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
2991 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
2992}
2993
2994unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2995 switch (Cond) {
2996 case SIInstrInfo::SCC_TRUE:
2997 return AMDGPU::S_CBRANCH_SCC1;
2998 case SIInstrInfo::SCC_FALSE:
2999 return AMDGPU::S_CBRANCH_SCC0;
3000 case SIInstrInfo::VCCNZ:
3001 return AMDGPU::S_CBRANCH_VCCNZ;
3002 case SIInstrInfo::VCCZ:
3003 return AMDGPU::S_CBRANCH_VCCZ;
3004 case SIInstrInfo::EXECNZ:
3005 return AMDGPU::S_CBRANCH_EXECNZ;
3006 case SIInstrInfo::EXECZ:
3007 return AMDGPU::S_CBRANCH_EXECZ;
3008 default:
3009 llvm_unreachable("invalid branch predicate");
3010 }
3011}
3012
3013SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3014 switch (Opcode) {
3015 case AMDGPU::S_CBRANCH_SCC0:
3016 return SCC_FALSE;
3017 case AMDGPU::S_CBRANCH_SCC1:
3018 return SCC_TRUE;
3019 case AMDGPU::S_CBRANCH_VCCNZ:
3020 return VCCNZ;
3021 case AMDGPU::S_CBRANCH_VCCZ:
3022 return VCCZ;
3023 case AMDGPU::S_CBRANCH_EXECNZ:
3024 return EXECNZ;
3025 case AMDGPU::S_CBRANCH_EXECZ:
3026 return EXECZ;
3027 default:
3028 return INVALID_BR;
3029 }
3030}
3031
3035 MachineBasicBlock *&FBB,
3037 bool AllowModify) const {
3038 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3039 // Unconditional Branch
3040 TBB = I->getOperand(0).getMBB();
3041 return false;
3042 }
3043
3044 MachineBasicBlock *CondBB = nullptr;
3045
3046 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
3047 CondBB = I->getOperand(1).getMBB();
3048 Cond.push_back(I->getOperand(0));
3049 } else {
3050 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3051 if (Pred == INVALID_BR)
3052 return true;
3053
3054 CondBB = I->getOperand(0).getMBB();
3055 Cond.push_back(MachineOperand::CreateImm(Pred));
3056 Cond.push_back(I->getOperand(1)); // Save the branch register.
3057 }
3058 ++I;
3059
3060 if (I == MBB.end()) {
3061 // Conditional branch followed by fall-through.
3062 TBB = CondBB;
3063 return false;
3064 }
3065
3066 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3067 TBB = CondBB;
3068 FBB = I->getOperand(0).getMBB();
3069 return false;
3070 }
3071
3072 return true;
3073}
3074
3076 MachineBasicBlock *&FBB,
3078 bool AllowModify) const {
3080 auto E = MBB.end();
3081 if (I == E)
3082 return false;
3083
3084 // Skip over the instructions that are artificially terminators for special
3085 // exec management.
3086 while (I != E && !I->isBranch() && !I->isReturn()) {
3087 switch (I->getOpcode()) {
3088 case AMDGPU::S_MOV_B64_term:
3089 case AMDGPU::S_XOR_B64_term:
3090 case AMDGPU::S_OR_B64_term:
3091 case AMDGPU::S_ANDN2_B64_term:
3092 case AMDGPU::S_AND_B64_term:
3093 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3094 case AMDGPU::S_MOV_B32_term:
3095 case AMDGPU::S_XOR_B32_term:
3096 case AMDGPU::S_OR_B32_term:
3097 case AMDGPU::S_ANDN2_B32_term:
3098 case AMDGPU::S_AND_B32_term:
3099 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3100 break;
3101 case AMDGPU::SI_IF:
3102 case AMDGPU::SI_ELSE:
3103 case AMDGPU::SI_KILL_I1_TERMINATOR:
3104 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3105 // FIXME: It's messy that these need to be considered here at all.
3106 return true;
3107 default:
3108 llvm_unreachable("unexpected non-branch terminator inst");
3109 }
3110
3111 ++I;
3112 }
3113
3114 if (I == E)
3115 return false;
3116
3117 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3118}
3119
3121 int *BytesRemoved) const {
3122 unsigned Count = 0;
3123 unsigned RemovedSize = 0;
3125 // Skip over artificial terminators when removing instructions.
3126 if (MI.isBranch() || MI.isReturn()) {
3127 RemovedSize += getInstSizeInBytes(MI);
3128 MI.eraseFromParent();
3129 ++Count;
3130 }
3131 }
3132
3133 if (BytesRemoved)
3134 *BytesRemoved = RemovedSize;
3135
3136 return Count;
3137}
3138
3139// Copy the flags onto the implicit condition register operand.
3141 const MachineOperand &OrigCond) {
3142 CondReg.setIsUndef(OrigCond.isUndef());
3143 CondReg.setIsKill(OrigCond.isKill());
3144}
3145
3148 MachineBasicBlock *FBB,
3150 const DebugLoc &DL,
3151 int *BytesAdded) const {
3152 if (!FBB && Cond.empty()) {
3153 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3154 .addMBB(TBB);
3155 if (BytesAdded)
3156 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3157 return 1;
3158 }
3159
3160 if (Cond.size() == 1 && Cond[0].isReg()) {
3161 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
3162 .add(Cond[0])
3163 .addMBB(TBB);
3164 return 1;
3165 }
3166
3167 assert(TBB && Cond[0].isImm());
3168
3169 unsigned Opcode
3170 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3171
3172 if (!FBB) {
3173 MachineInstr *CondBr =
3174 BuildMI(&MBB, DL, get(Opcode))
3175 .addMBB(TBB);
3176
3177 // Copy the flags onto the implicit condition register operand.
3178 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3179 fixImplicitOperands(*CondBr);
3180
3181 if (BytesAdded)
3182 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3183 return 1;
3184 }
3185
3186 assert(TBB && FBB);
3187
3188 MachineInstr *CondBr =
3189 BuildMI(&MBB, DL, get(Opcode))
3190 .addMBB(TBB);
3191 fixImplicitOperands(*CondBr);
3192 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3193 .addMBB(FBB);
3194
3195 MachineOperand &CondReg = CondBr->getOperand(1);
3196 CondReg.setIsUndef(Cond[1].isUndef());
3197 CondReg.setIsKill(Cond[1].isKill());
3198
3199 if (BytesAdded)
3200 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3201
3202 return 2;
3203}
3204
3207 if (Cond.size() != 2) {
3208 return true;
3209 }
3210
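  // The BranchPredicate values come in +/- pairs (e.g. SCC_TRUE / SCC_FALSE),
  // so negating the stored immediate flips the condition.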
3211 if (Cond[0].isImm()) {
3212 Cond[0].setImm(-Cond[0].getImm());
3213 return false;
3214 }
3215
3216 return true;
3217}
3218
3221 Register DstReg, Register TrueReg,
3222 Register FalseReg, int &CondCycles,
3223 int &TrueCycles, int &FalseCycles) const {
3224 switch (Cond[0].getImm()) {
3225 case VCCNZ:
3226 case VCCZ: {
3228 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3229 if (MRI.getRegClass(FalseReg) != RC)
3230 return false;
3231
3232 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3233 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3234
3235 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3236 return RI.hasVGPRs(RC) && NumInsts <= 6;
3237 }
3238 case SCC_TRUE:
3239 case SCC_FALSE: {
3240 // FIXME: We could insert for VGPRs if we could replace the original compare
3241 // with a vector one.
3243 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3244 if (MRI.getRegClass(FalseReg) != RC)
3245 return false;
3246
3247 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3248
3249 // Sizes that are a multiple of 8 bytes (64 bits) can use s_cselect_b64.
3250 if (NumInsts % 2 == 0)
3251 NumInsts /= 2;
3252
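    // E.g. a 128-bit SGPR select costs 2 s_cselect_b64, while a 96-bit one
    // costs 3 s_cselect_b32.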
3253 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3254 return RI.isSGPRClass(RC);
3255 }
3256 default:
3257 return false;
3258 }
3259}
3260
3264 Register TrueReg, Register FalseReg) const {
3265 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3266 if (Pred == VCCZ || Pred == SCC_FALSE) {
3267 Pred = static_cast<BranchPredicate>(-Pred);
3268 std::swap(TrueReg, FalseReg);
3269 }
3270
3272 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3273 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3274
3275 if (DstSize == 32) {
3277 if (Pred == SCC_TRUE) {
3278 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3279 .addReg(TrueReg)
3280 .addReg(FalseReg);
3281 } else {
3282 // Instruction's operands are backwards from what is expected.
3283 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3284 .addReg(FalseReg)
3285 .addReg(TrueReg);
3286 }
3287
3288 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3289 return;
3290 }
3291
3292 if (DstSize == 64 && Pred == SCC_TRUE) {
3294 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3295 .addReg(TrueReg)
3296 .addReg(FalseReg);
3297
3298 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3299 return;
3300 }
3301
3302 static const int16_t Sub0_15[] = {
3303 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3304 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3305 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3306 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3307 };
3308
3309 static const int16_t Sub0_15_64[] = {
3310 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3311 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3312 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3313 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3314 };
3315
3316 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3317 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3318 const int16_t *SubIndices = Sub0_15;
3319 int NElts = DstSize / 32;
3320
3321 // 64-bit select is only available for SALU.
3322 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
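  // E.g. a 128-bit SGPR select under SCC expands to two s_cselect_b64 on
  // sub0_sub1 and sub2_sub3, recombined with a REG_SEQUENCE; the VGPR path
  // instead uses one v_cndmask_b32 per 32-bit subregister.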
3323 if (Pred == SCC_TRUE) {
3324 if (NElts % 2) {
3325 SelOp = AMDGPU::S_CSELECT_B32;
3326 EltRC = &AMDGPU::SGPR_32RegClass;
3327 } else {
3328 SelOp = AMDGPU::S_CSELECT_B64;
3329 EltRC = &AMDGPU::SGPR_64RegClass;
3330 SubIndices = Sub0_15_64;
3331 NElts /= 2;
3332 }
3333 }
3334
3336 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3337
3338 I = MIB->getIterator();
3339
3341 for (int Idx = 0; Idx != NElts; ++Idx) {
3342 Register DstElt = MRI.createVirtualRegister(EltRC);
3343 Regs.push_back(DstElt);
3344
3345 unsigned SubIdx = SubIndices[Idx];
3346
3348 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3349 Select =
3350 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3351 .addReg(FalseReg, 0, SubIdx)
3352 .addReg(TrueReg, 0, SubIdx);
3353 } else {
3354 Select =
3355 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3356 .addReg(TrueReg, 0, SubIdx)
3357 .addReg(FalseReg, 0, SubIdx);
3358 }
3359
3360 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3362
3363 MIB.addReg(DstElt)
3364 .addImm(SubIdx);
3365 }
3366}
3367
3369 switch (MI.getOpcode()) {
3370 case AMDGPU::V_MOV_B32_e32:
3371 case AMDGPU::V_MOV_B32_e64:
3372 case AMDGPU::V_MOV_B64_PSEUDO:
3373 case AMDGPU::V_MOV_B64_e32:
3374 case AMDGPU::V_MOV_B64_e64:
3375 case AMDGPU::S_MOV_B32:
3376 case AMDGPU::S_MOV_B64:
3377 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3378 case AMDGPU::COPY:
3379 case AMDGPU::WWM_COPY:
3380 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3381 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3382 case AMDGPU::V_ACCVGPR_MOV_B32:
3383 return true;
3384 default:
3385 return false;
3386 }
3387}
3388
3389static constexpr unsigned ModifierOpNames[] = {
3390 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3391 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3392 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3393
3395 unsigned Opc = MI.getOpcode();
3396 for (unsigned Name : reverse(ModifierOpNames)) {
3398 if (Idx >= 0)
3399 MI.removeOperand(Idx);
3400 }
3401}
3402
3404 Register Reg, MachineRegisterInfo *MRI) const {
3405 if (!MRI->hasOneNonDBGUse(Reg))
3406 return false;
3407
3408 switch (DefMI.getOpcode()) {
3409 default:
3410 return false;
3411 case AMDGPU::V_MOV_B64_e32:
3412 case AMDGPU::S_MOV_B64:
3413 case AMDGPU::V_MOV_B64_PSEUDO:
3414 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3415 case AMDGPU::V_MOV_B32_e32:
3416 case AMDGPU::S_MOV_B32:
3417 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3418 break;
3419 }
3420
3421 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
3422 assert(ImmOp);
3423 // FIXME: We could handle FrameIndex values here.
3424 if (!ImmOp->isImm())
3425 return false;
3426
3427 auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
3428 int64_t Imm = ImmOp->getImm();
3429 switch (UseOp.getSubReg()) {
3430 default:
3431 return Imm;
3432 case AMDGPU::sub0:
3433 return Lo_32(Imm);
3434 case AMDGPU::sub1:
3435 return Hi_32(Imm);
3436 case AMDGPU::lo16:
3437 return APInt(16, Imm).getSExtValue();
3438 case AMDGPU::hi16:
3439 return APInt(32, Imm).ashr(16).getSExtValue();
3440 case AMDGPU::sub1_lo16:
3441 return APInt(16, Hi_32(Imm)).getSExtValue();
3442 case AMDGPU::sub1_hi16:
3443 return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue();
3444 }
3445 };
3446
3447 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3448
3449 unsigned Opc = UseMI.getOpcode();
3450 if (Opc == AMDGPU::COPY) {
3451 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3452
3453 Register DstReg = UseMI.getOperand(0).getReg();
3454 unsigned OpSize = getOpSize(UseMI, 0);
3455 bool Is16Bit = OpSize == 2;
3456 bool Is64Bit = OpSize == 8;
3457 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3458 unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3459 : AMDGPU::V_MOV_B32_e32
3460 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3461 : AMDGPU::S_MOV_B32;
3462 APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)));
3463
3464 if (RI.isAGPR(*MRI, DstReg)) {
3465 if (Is64Bit || !isInlineConstant(Imm))
3466 return false;
3467 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3468 }
3469
3470 if (Is16Bit) {
3471 if (isVGPRCopy)
3472 return false; // Do not clobber vgpr_hi16
3473
3474 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3475 return false;
3476
3477 UseMI.getOperand(0).setSubReg(0);
3478 if (DstReg.isPhysical()) {
3479 DstReg = RI.get32BitRegister(DstReg);
3480 UseMI.getOperand(0).setReg(DstReg);
3481 }
3482 assert(UseMI.getOperand(1).getReg().isVirtual());
3483 }
3484
3485 const MCInstrDesc &NewMCID = get(NewOpc);
3486 if (DstReg.isPhysical() &&
3487 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3488 return false;
3489
3490 UseMI.setDesc(NewMCID);
3491 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3492 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3493 return true;
3494 }
3495
3496 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3497 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3498 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3499 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3500 Opc == AMDGPU::V_FMAC_F16_t16_e64) {
3501 // Don't fold if we are using source or output modifiers. The new VOP2
3502 // instructions don't have them.
3504 return false;
3505
3506 // If this is a free constant, there's no reason to do this.
3507 // TODO: We could fold this here instead of letting SIFoldOperands do it
3508 // later.
3509 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
3510
3511 // Any src operand can be used for the legality check.
3512 if (isInlineConstant(UseMI, *Src0, *ImmOp))
3513 return false;
3514
3515 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3516 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
3517 bool IsFMA =
3518 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3519 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3520 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3521 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3522 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3523
3524 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3525 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3526 (Src1->isReg() && Src1->getReg() == Reg)) {
3527 MachineOperand *RegSrc =
3528 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3529 if (!RegSrc->isReg())
3530 return false;
3531 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3532 ST.getConstantBusLimit(Opc) < 2)
3533 return false;
3534
3535 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3536 return false;
3537
3538 // If src2 is also a literal constant then we have to choose which one to
3539 // fold. In general it is better to choose madak so that the other literal
3540 // can be materialized in an sgpr instead of a vgpr:
3541 // s_mov_b32 s0, literal
3542 // v_madak_f32 v0, s0, v0, literal
3543 // Instead of:
3544 // v_mov_b32 v1, literal
3545 // v_madmk_f32 v0, v0, literal, v1
3546 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3547 if (Def && Def->isMoveImmediate() &&
3548 !isInlineConstant(Def->getOperand(1)))
3549 return false;
3550
3551 unsigned NewOpc =
3552 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3553 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3554 : AMDGPU::V_FMAMK_F16)
3555 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3556 if (pseudoToMCOpcode(NewOpc) == -1)
3557 return false;
3558
3559 // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3560 // would also require restricting their register classes. For now
3561 // just bail out.
3562 if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
3563 return false;
3564
3565 const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
3566
3567 // FIXME: This would be a lot easier if we could return a new instruction
3568 // instead of having to modify in place.
3569
3570 Register SrcReg = RegSrc->getReg();
3571 unsigned SrcSubReg = RegSrc->getSubReg();
3572 Src0->setReg(SrcReg);
3573 Src0->setSubReg(SrcSubReg);
3574 Src0->setIsKill(RegSrc->isKill());
3575
3576 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3577 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3578 Opc == AMDGPU::V_FMAC_F16_e64)
3579 UseMI.untieRegOperand(
3580 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3581
3582 Src1->ChangeToImmediate(Imm);
3583
3584 removeModOperands(UseMI);
3585 UseMI.setDesc(get(NewOpc));
3586
3587 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3588 if (DeleteDef)
3589 DefMI.eraseFromParent();
3590
3591 return true;
3592 }
3593
3594 // Added part is the constant: Use v_madak_{f16, f32}.
3595 if (Src2->isReg() && Src2->getReg() == Reg) {
3596 if (ST.getConstantBusLimit(Opc) < 2) {
3597 // Not allowed to use constant bus for another operand.
3598 // We can however allow an inline immediate as src0.
3599 bool Src0Inlined = false;
3600 if (Src0->isReg()) {
3601 // Try to inline constant if possible.
3602 // If the def is a move-immediate and this is its only use,
3603 // folding the constant here saves a VGPR.
3604 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3605 if (Def && Def->isMoveImmediate() &&
3606 isInlineConstant(Def->getOperand(1)) &&
3607 MRI->hasOneUse(Src0->getReg())) {
3608 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3609 Src0Inlined = true;
3610 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3611 RI.isSGPRReg(*MRI, Src0->getReg())) {
3612 return false;
3613 }
3614 // VGPR is okay as Src0 - fallthrough
3615 }
3616
3617 if (Src1->isReg() && !Src0Inlined) {
3618 // We have one slot for inlinable constant so far - try to fill it
3619 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3620 if (Def && Def->isMoveImmediate() &&
3621 isInlineConstant(Def->getOperand(1)) &&
3622 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3623 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3624 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3625 return false;
3626 // VGPR is okay as Src1 - fallthrough
3627 }
3628 }
3629
3630 unsigned NewOpc =
3631 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3632 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3633 : AMDGPU::V_FMAAK_F16)
3634 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3635 if (pseudoToMCOpcode(NewOpc) == -1)
3636 return false;
3637
3638 // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3639 // would also require restricting their register classes. For now
3640 // just bail out.
3641 if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
3642 return false;
3643
3644 // FIXME: This would be a lot easier if we could return a new instruction
3645 // instead of having to modify in place.
3646
3647 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3648 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3649 Opc == AMDGPU::V_FMAC_F16_e64)
3650 UseMI.untieRegOperand(
3651 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3652
3653 // ChangingToImmediate adds Src2 back to the instruction.
3654 Src2->ChangeToImmediate(getImmFor(*Src2));
3655
3656 // These come before src2.
3657 removeModOperands(UseMI);
3658 UseMI.setDesc(get(NewOpc));
3659 // It might happen that UseMI was commuted and we now have an SGPR as
3660 // src1. If so, two inline constants plus an SGPR would be illegal, so
3661 // legalize the operands.
3662 legalizeOperands(UseMI);
3663
3664 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3665 if (DeleteDef)
3666 DefMI.eraseFromParent();
3667
3668 return true;
3669 }
3670 }
3671
3672 return false;
3673}
3674
3675static bool
3676memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3677 ArrayRef<const MachineOperand *> BaseOps2) {
3678 if (BaseOps1.size() != BaseOps2.size())
3679 return false;
3680 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3681 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3682 return false;
3683 }
3684 return true;
3685}
3686
3687static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3688 LocationSize WidthB, int OffsetB) {
3689 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3690 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3691 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3692 return LowWidth.hasValue() &&
3693 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3694}
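// Worked example (illustrative note, not part of the original file): with
// WidthA = 4 at OffsetA = 0 and WidthB = 8 at OffsetB = 4, we get
// LowOffset = 0, HighOffset = 4 and LowWidth = 4, so 0 + 4 <= 4 holds and the
// byte ranges [0,4) and [4,12) are reported as non-overlapping. With
// OffsetB = 2 the check 0 + 4 <= 2 fails and the accesses are conservatively
// treated as possibly overlapping.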
3695
3696bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3697 const MachineInstr &MIb) const {
3698 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3699 int64_t Offset0, Offset1;
3700 LocationSize Dummy0 = 0, Dummy1 = 0;
3701 bool Offset0IsScalable, Offset1IsScalable;
3702 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3703 Dummy0, &RI) ||
3704 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3705 Dummy1, &RI))
3706 return false;
3707
3708 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3709 return false;
3710
3711 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3712 // FIXME: Handle ds_read2 / ds_write2.
3713 return false;
3714 }
3715 LocationSize Width0 = MIa.memoperands().front()->getSize();
3716 LocationSize Width1 = MIb.memoperands().front()->getSize();
3717 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3718}
3719
3720bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3721 const MachineInstr &MIb) const {
3722 assert(MIa.mayLoadOrStore() &&
3723 "MIa must load from or modify a memory location");
3724 assert(MIb.mayLoadOrStore() &&
3725 "MIb must load from or modify a memory location");
3726
3727 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3728 return false;
3729
3730 // XXX - Can we relax this between address spaces?
3731 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3732 return false;
3733
3734 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3735 return false;
3736
3737 // TODO: Should we check the address space from the MachineMemOperand? That
3738 // would allow us to distinguish objects we know don't alias based on the
3739 // underlying address space, even if it was lowered to a different one,
3740 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3741 // buffer.
3742 if (isDS(MIa)) {
3743 if (isDS(MIb))
3744 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3745
3746 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3747 }
3748
3749 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3750 if (isMUBUF(MIb) || isMTBUF(MIb))
3751 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3752
3753 if (isFLAT(MIb))
3754 return isFLATScratch(MIb);
3755
3756 return !isSMRD(MIb);
3757 }
3758
3759 if (isSMRD(MIa)) {
3760 if (isSMRD(MIb))
3761 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3762
3763 if (isFLAT(MIb))
3764 return isFLATScratch(MIb);
3765
3766 return !isMUBUF(MIb) && !isMTBUF(MIb);
3767 }
3768
3769 if (isFLAT(MIa)) {
3770 if (isFLAT(MIb)) {
3771 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3772 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3773 return true;
3774
3775 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3776 }
3777
3778 return false;
3779 }
3780
3781 return false;
3782}
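// Illustrative queries (hypothetical instruction mix, not from the original
// source): a DS access paired with a segment-specific FLAT access (global or
// scratch) is reported disjoint because LDS cannot alias those segments, two
// DS or two buffer accesses fall through to the offset/width check above, and
// a buffer or SMRD access paired with a generic FLAT access is never assumed
// disjoint.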
3783
3784static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3785 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3786 if (Reg.isPhysical())
3787 return false;
3788 auto *Def = MRI.getUniqueVRegDef(Reg);
3789 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3790 Imm = Def->getOperand(1).getImm();
3791 if (DefMI)
3792 *DefMI = Def;
3793 return true;
3794 }
3795 return false;
3796}
3797
3798static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3799 MachineInstr **DefMI = nullptr) {
3800 if (!MO->isReg())
3801 return false;
3802 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3803 const MachineRegisterInfo &MRI = MF->getRegInfo();
3804 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3805}
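// Illustrative use (hypothetical virtual registers, not from the original
// source): if %1 is defined only by
//   %1:sreg_32 = S_MOV_B32 42
// then getFoldableImm(%1, MRI, Imm, &Def) returns true with Imm == 42 and Def
// pointing at the S_MOV_B32, because the unique def is a foldable copy with an
// immediate source; a physical register or a multiply-defined register
// returns false.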
3806
3807static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3808 MachineInstr &NewMI) {
3809 if (LV) {
3810 unsigned NumOps = MI.getNumOperands();
3811 for (unsigned I = 1; I < NumOps; ++I) {
3812 MachineOperand &Op = MI.getOperand(I);
3813 if (Op.isReg() && Op.isKill())
3814 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3815 }
3816 }
3817}
3818
3819MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3820 LiveVariables *LV,
3821 LiveIntervals *LIS) const {
3822 MachineBasicBlock &MBB = *MI.getParent();
3823 unsigned Opc = MI.getOpcode();
3824
3825 // Handle MFMA.
3826 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3827 if (NewMFMAOpc != -1) {
3828 MachineInstrBuilder MIB =
3829 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3830 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3831 MIB.add(MI.getOperand(I));
3832 updateLiveVariables(LV, MI, *MIB);
3833 if (LIS) {
3834 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3835 // SlotIndex of defs needs to be updated when converting to early-clobber
3836 MachineOperand &Def = MIB->getOperand(0);
3837 if (Def.isEarlyClobber() && Def.isReg() &&
3838 LIS->hasInterval(Def.getReg())) {
3839 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
3840 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
3841 auto &LI = LIS->getInterval(Def.getReg());
3842 auto UpdateDefIndex = [&](LiveRange &LR) {
3843 auto S = LR.find(OldIndex);
3844 if (S != LR.end() && S->start == OldIndex) {
3845 assert(S->valno && S->valno->def == OldIndex);
3846 S->start = NewIndex;
3847 S->valno->def = NewIndex;
3848 }
3849 };
3850 UpdateDefIndex(LI);
3851 for (auto &SR : LI.subranges())
3852 UpdateDefIndex(SR);
3853 }
3854 }
3855 return MIB;
3856 }
3857
3858 if (SIInstrInfo::isWMMA(MI)) {
3859 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
3860 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3861 .setMIFlags(MI.getFlags());
3862 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3863 MIB->addOperand(MI.getOperand(I));
3864
3865 updateLiveVariables(LV, MI, *MIB);
3866 if (LIS)
3867 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3868
3869 return MIB;
3870 }
3871
3872 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
3873 "V_FMAC_F16_t16_e32 is not supported and not expected to be present "
3874 "pre-RA");
3875
3876 // Handle MAC/FMAC.
3877 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3878 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3879 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3880 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3881 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3882 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3883 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3884 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3885 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3886 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3887 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3888 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3889 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3890 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3891 bool Src0Literal = false;
3892
3893 switch (Opc) {
3894 default:
3895 return nullptr;
3896 case AMDGPU::V_MAC_F16_e64:
3897 case AMDGPU::V_FMAC_F16_e64:
3898 case AMDGPU::V_FMAC_F16_t16_e64:
3899 case AMDGPU::V_MAC_F32_e64:
3900 case AMDGPU::V_MAC_LEGACY_F32_e64:
3901 case AMDGPU::V_FMAC_F32_e64:
3902 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3903 case AMDGPU::V_FMAC_F64_e64:
3904 break;
3905 case AMDGPU::V_MAC_F16_e32:
3906 case AMDGPU::V_FMAC_F16_e32:
3907 case AMDGPU::V_MAC_F32_e32:
3908 case AMDGPU::V_MAC_LEGACY_F32_e32:
3909 case AMDGPU::V_FMAC_F32_e32:
3910 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3911 case AMDGPU::V_FMAC_F64_e32: {
3912 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3913 AMDGPU::OpName::src0);
3914 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3915 if (!Src0->isReg() && !Src0->isImm())
3916 return nullptr;
3917
3918 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3919 Src0Literal = true;
3920
3921 break;
3922 }
3923 }
3924
3925 MachineInstrBuilder MIB;
3926 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3927 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3928 const MachineOperand *Src0Mods =
3929 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3930 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3931 const MachineOperand *Src1Mods =
3932 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3933 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3934 const MachineOperand *Src2Mods =
3935 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
3936 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3937 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3938 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
3939
3940 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
3941 !IsLegacy &&
3942 // If we have an SGPR input, we will violate the constant bus restriction.
3943 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3944 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3945 MachineInstr *DefMI;
3946 const auto killDef = [&]() -> void {
3947 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3948 // The only user is the instruction which will be killed.
3949 Register DefReg = DefMI->getOperand(0).getReg();
3950 if (!MRI.hasOneNonDBGUse(DefReg))
3951 return;
3952 // We cannot just remove the DefMI here; the calling pass would crash.
3953 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
3954 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
3955 DefMI->removeOperand(I);
3956 if (LV)
3957 LV->getVarInfo(DefReg).AliveBlocks.clear();
3958 };
3959
3960 int64_t Imm;
3961 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
3962 unsigned NewOpc =
3963 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3964 : AMDGPU::V_FMAAK_F16)
3965 : AMDGPU::V_FMAAK_F32)
3966 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
3967 if (pseudoToMCOpcode(NewOpc) != -1) {
3968 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3969 .add(*Dst)
3970 .add(*Src0)
3971 .add(*Src1)
3972 .addImm(Imm)
3973 .setMIFlags(MI.getFlags());
3974 updateLiveVariables(LV, MI, *MIB);
3975 if (LIS)
3976 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3977 killDef();
3978 return MIB;
3979 }
3980 }
3981 unsigned NewOpc =
3982 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3983 : AMDGPU::V_FMAMK_F16)
3984 : AMDGPU::V_FMAMK_F32)
3985 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
3986 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
3987 if (pseudoToMCOpcode(NewOpc) != -1) {
3988 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3989 .add(*Dst)
3990 .add(*Src0)
3991 .addImm(Imm)
3992 .add(*Src2)
3993 .setMIFlags(MI.getFlags());
3994 updateLiveVariables(LV, MI, *MIB);
3995 if (LIS)
3996 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3997 killDef();
3998 return MIB;
3999 }
4000 }
4001 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4002 if (Src0Literal) {
4003 Imm = Src0->getImm();
4004 DefMI = nullptr;
4005 }
4006 if (pseudoToMCOpcode(NewOpc) != -1 &&
4007 isOperandLegal(
4008 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4009 Src1)) {
4010 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4011 .add(*Dst)
4012 .add(*Src1)
4013 .addImm(Imm)
4014 .add(*Src2)
4015 .setMIFlags(MI.getFlags());
4016 updateLiveVariables(LV, MI, *MIB);
4017 if (LIS)
4018 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4019 if (DefMI)
4020 killDef();
4021 return MIB;
4022 }
4023 }
4024 }
4025
4026 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4027 // if VOP3 does not allow a literal operand.
4028 if (Src0Literal && !ST.hasVOP3Literal())
4029 return nullptr;
4030
4031 unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
4032 : IsF64 ? AMDGPU::V_FMA_F64_e64
4033 : IsLegacy
4034 ? AMDGPU::V_FMA_LEGACY_F32_e64
4035 : AMDGPU::V_FMA_F32_e64
4036 : IsF16 ? AMDGPU::V_MAD_F16_e64
4037 : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
4038 : AMDGPU::V_MAD_F32_e64;
4039 if (pseudoToMCOpcode(NewOpc) == -1)
4040 return nullptr;
4041
4042 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4043 .add(*Dst)
4044 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4045 .add(*Src0)
4046 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4047 .add(*Src1)
4048 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4049 .add(*Src2)
4050 .addImm(Clamp ? Clamp->getImm() : 0)
4051 .addImm(Omod ? Omod->getImm() : 0)
4052 .setMIFlags(MI.getFlags());
4053 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4054 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4055 updateLiveVariables(LV, MI, *MIB);
4056 if (LIS)
4057 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4058 return MIB;
4059}
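// Illustrative conversion (hypothetical MIR, not from the original source):
// a tied two-address MAC such as
//   $vgpr0 = V_MAC_F32_e32 $vgpr1, $vgpr2, $vgpr0
// is rewritten to the untied VOP3 form
//   $vgpr0 = V_MAD_F32_e64 0, $vgpr1, 0, $vgpr2, 0, $vgpr0, 0, 0
// and, when one source is produced by a foldable S_MOV_B32 immediate, the
// V_MADAK/V_MADMK (or V_FMAAK/V_FMAMK) forms above are chosen instead so the
// literal is folded directly into the new instruction.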
4060
4061// It's not generally safe to move VALU instructions across these since it will
4062// start using the register as a base index rather than directly.
4063// XXX - Why isn't hasSideEffects sufficient for these?
4064static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4065 switch (MI.getOpcode()) {
4066 case AMDGPU::S_SET_GPR_IDX_ON:
4067 case AMDGPU::S_SET_GPR_IDX_MODE:
4068 case AMDGPU::S_SET_GPR_IDX_OFF:
4069 return true;
4070 default:
4071 return false;
4072 }
4073}
4074
4075bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4076 const MachineBasicBlock *MBB,
4077 const MachineFunction &MF) const {
4078 // Skipping the check for SP writes in the base implementation. The reason it
4079 // was added was apparently due to compile time concerns.
4080 //
4081 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4082 // but is probably avoidable.
4083
4084 // Copied from base implementation.
4085 // Terminators and labels can't be scheduled around.
4086 if (MI.isTerminator() || MI.isPosition())
4087 return true;
4088
4089 // INLINEASM_BR can jump to another block
4090 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4091 return true;
4092
4093 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4094 return true;
4095
4096 // Target-independent instructions do not have an implicit-use of EXEC, even
4097 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4098 // boundaries prevents incorrect movements of such instructions.
4099 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4100 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4101 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4102 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4103 changesVGPRIndexingMode(MI);
4104}
4105
4106bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4107 return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
4108}
4109
4110bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4111 // Skip the full operand and register alias search that modifiesRegister
4112 // does. There's only a handful of instructions that touch this, it's only an
4113 // implicit def, and doesn't alias any other registers.
4114 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4115}
4116
4117bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4118 unsigned Opcode = MI.getOpcode();
4119
4120 if (MI.mayStore() && isSMRD(MI))
4121 return true; // scalar store or atomic
4122
4123 // This will terminate the function when other lanes may need to continue.
4124 if (MI.isReturn())
4125 return true;
4126
4127 // These instructions cause shader I/O that may cause hardware lockups
4128 // when executed with an empty EXEC mask.
4129 //
4130 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4131 // EXEC = 0, but checking for that case here seems not worth it
4132 // given the typical code patterns.
4133 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4134 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4135 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4136 return true;
4137
4138 if (MI.isCall() || MI.isInlineAsm())
4139 return true; // conservative assumption
4140
4141 // Assume that barrier interactions are only intended with active lanes.
4142 if (isBarrier(Opcode))
4143 return true;
4144
4145 // A mode change is a scalar operation that influences vector instructions.
4146 if (modifiesModeRegister(MI))
4147 return true;
4148
4149 // These are like SALU instructions in terms of effects, so it's questionable
4150 // whether we should return true for those.
4151 //
4152 // However, executing them with EXEC = 0 causes them to operate on undefined
4153 // data, which we avoid by returning true here.
4154 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4155 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4156 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4157 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4158 return true;
4159
4160 return false;
4161}
4162
4163bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4164 const MachineInstr &MI) const {
4165 if (MI.isMetaInstruction())
4166 return false;
4167
4168 // This won't read exec if this is an SGPR->SGPR copy.
4169 if (MI.isCopyLike()) {
4170 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4171 return true;
4172
4173 // Make sure this isn't copying exec as a normal operand
4174 return MI.readsRegister(AMDGPU::EXEC, &RI);
4175 }
4176
4177 // Make a conservative assumption about the callee.
4178 if (MI.isCall())
4179 return true;
4180
4181 // Be conservative with any unhandled generic opcodes.
4182 if (!isTargetSpecificOpcode(MI.getOpcode()))
4183 return true;
4184
4185 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4186}
4187
4188bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4189 switch (Imm.getBitWidth()) {
4190 case 1: // This likely will be a condition code mask.
4191 return true;
4192
4193 case 32:
4194 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4195 ST.hasInv2PiInlineImm());
4196 case 64:
4197 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4198 ST.hasInv2PiInlineImm());
4199 case 16:
4200 return ST.has16BitInsts() &&
4201 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4202 ST.hasInv2PiInlineImm());
4203 default:
4204 llvm_unreachable("invalid bitwidth");
4205 }
4206}
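// Examples (from the AMDGPU inline-constant encoding; illustrative note, not
// part of the original file): the integers -16..64 and the floats +-0.5,
// +-1.0, +-2.0, +-4.0 (plus 1/(2*pi) when ST.hasInv2PiInlineImm()) are
// representable inline, so for instance isInlineConstant(APInt(32, 64)) is
// true while a value such as 65 must be emitted as a literal and occupies the
// constant bus.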
4207
4208bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4209 APInt IntImm = Imm.bitcastToAPInt();
4210 int64_t IntImmVal = IntImm.getSExtValue();
4211 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4212 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4213 default:
4214 llvm_unreachable("invalid fltSemantics");
4215 case APFloatBase::S_IEEEsingle:
4216 case APFloatBase::S_IEEEdouble:
4217 return isInlineConstant(IntImm);
4218 case APFloatBase::S_BFloat:
4219 return ST.has16BitInsts() &&
4220 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4221 case APFloatBase::S_IEEEhalf:
4222 return ST.has16BitInsts() &&
4223 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4224 }
4225}
4226
4227bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
4228 uint8_t OperandType) const {
4229 assert(!MO.isReg() && "isInlineConstant called on register operand!");
4230 if (!MO.isImm())
4231 return false;
4232
4233 // MachineOperand provides no way to tell the true operand size, since it only
4234 // records a 64-bit value. We need to know the size to determine if a 32-bit
4235 // floating point immediate bit pattern is legal for an integer immediate. It
4236 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4237
4238 int64_t Imm = MO.getImm();
4239 switch (OperandType) {
4240 case AMDGPU::OPERAND_REG_IMM_INT32:
4241 case AMDGPU::OPERAND_REG_IMM_FP32:
4242 case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
4243 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
4244 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
4245 case AMDGPU::OPERAND_REG_IMM_V2FP32:
4246 case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
4247 case AMDGPU::OPERAND_REG_IMM_V2INT32:
4248 case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
4249 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
4250 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
4251 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
4252 int32_t Trunc = static_cast<int32_t>(Imm);
4253 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4254 }
4255 case AMDGPU::OPERAND_REG_IMM_INT64:
4256 case AMDGPU::OPERAND_REG_IMM_FP64:
4257 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
4258 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
4259 case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
4260 return AMDGPU::isInlinableLiteral64(MO.getImm(),
4261 ST.hasInv2PiInlineImm());
4262 case AMDGPU::OPERAND_REG_IMM_INT16:
4263 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
4264 case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
4265 // We would expect inline immediates to not be concerned with an integer/fp
4266 // distinction. However, in the case of 16-bit integer operations, the
4267 // "floating point" values appear to not work. It seems to read the low 16 bits
4268 // of 32-bit immediates, which happens to always work for the integer
4269 // values.
4270 //
4271 // See llvm bugzilla 46302.
4272 //
4273 // TODO: Theoretically we could use op-sel to use the high bits of the
4274 // 32-bit FP values.
4275 return AMDGPU::isInlinableIntLiteral(Imm);
4276 case AMDGPU::OPERAND_REG_IMM_V2INT16:
4277 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
4278 case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
4279 return AMDGPU::isInlinableLiteralV2I16(Imm);
4280 case AMDGPU::OPERAND_REG_IMM_V2FP16:
4281 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
4282 case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
4283 return AMDGPU::isInlinableLiteralV2F16(Imm);
4284 case AMDGPU::OPERAND_REG_IMM_V2BF16:
4285 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
4286 case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16:
4287 return AMDGPU::isInlinableLiteralV2BF16(Imm);
4288 case AMDGPU::OPERAND_REG_IMM_FP16:
4289 case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
4290 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
4291 case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
4292 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4293 // A few special case instructions have 16-bit operands on subtargets
4294 // where 16-bit instructions are not legal.
4295 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4296 // constants in these cases
4297 int16_t Trunc = static_cast<int16_t>(Imm);
4298 return ST.has16BitInsts() &&
4299 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4300 }
4301
4302 return false;
4303 }
4304 case AMDGPU::OPERAND_REG_IMM_BF16:
4305 case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED:
4306 case AMDGPU::OPERAND_REG_INLINE_C_BF16:
4307 case AMDGPU::OPERAND_REG_INLINE_AC_BF16: {
4308 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4309 int16_t Trunc = static_cast<int16_t>(Imm);
4310 return ST.has16BitInsts() &&
4311 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4312 }
4313 return false;
4314 }
4315 case AMDGPU::OPERAND_KIMM32:
4316 case AMDGPU::OPERAND_KIMM16:
4317 return false;
4318 case AMDGPU::OPERAND_INPUT_MODS:
4319 case MCOI::OPERAND_IMMEDIATE:
4320 // Always embedded in the instruction for free.
4321 return true;
4322 case MCOI::OPERAND_UNKNOWN:
4323 case MCOI::OPERAND_REGISTER:
4324 case MCOI::OPERAND_PCREL:
4325 case MCOI::OPERAND_GENERIC_0:
4326 case MCOI::OPERAND_GENERIC_1:
4327 case MCOI::OPERAND_GENERIC_2:
4328 case MCOI::OPERAND_GENERIC_3:
4329 case MCOI::OPERAND_GENERIC_4:
4330 case MCOI::OPERAND_GENERIC_5:
4331 // Just ignore anything else.
4332 return true;
4333 default:
4334 llvm_unreachable("invalid operand type");
4335 }
4336}
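// Example (illustrative note, not part of the original file): for an
// OPERAND_REG_IMM_FP16 operand the bit pattern 0x3C00 (half-precision 1.0) is
// accepted as inline, whereas the 16-bit integer operand types go through
// isInlinableIntLiteral above, so only the small integers -16..64 qualify
// there.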
4337
4338static bool compareMachineOp(const MachineOperand &Op0,
4339 const MachineOperand &Op1) {
4340 if (Op0.getType() != Op1.getType())
4341 return false;
4342
4343 switch (Op0.getType()) {
4344 case MachineOperand::MO_Register:
4345 return Op0.getReg() == Op1.getReg();
4346 case MachineOperand::MO_Immediate:
4347 return Op0.getImm() == Op1.getImm();
4348 default:
4349 llvm_unreachable("Didn't expect to be comparing these operand types");
4350 }
4351}
4352
4353bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4354 const MachineOperand &MO) const {
4355 const MCInstrDesc &InstDesc = MI.getDesc();
4356 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4357
4358 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4359
4360 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4361 return true;
4362
4363 if (OpInfo.RegClass < 0)
4364 return false;
4365
4366 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4367 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4368 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4369 AMDGPU::OpName::src2))
4370 return false;
4371 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4372 }
4373
4374 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4375 return false;
4376
4377 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4378 return true;
4379
4380 return ST.hasVOP3Literal();
4381}
4382
4383bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4384 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4385 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4386 return false;
4387
4388 int Op32 = AMDGPU::getVOPe32(Opcode);
4389 if (Op32 == -1)
4390 return false;
4391
4392 return pseudoToMCOpcode(Op32) != -1;
4393}
4394
4395bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4396 // The src0_modifier operand is present on all instructions
4397 // that have modifiers.
4398
4399 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4400}
4401
4402bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4403 unsigned OpName) const {
4404 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4405 return Mods && Mods->getImm();
4406}
4407
4408bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4409 return any_of(ModifierOpNames,
4410 [&](unsigned Name) { return hasModifiersSet(MI, Name); });
4411}
4412
4413bool SIInstrInfo::canShrink(const MachineInstr &MI,
4414 const MachineRegisterInfo &MRI) const {
4415 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4416 // Can't shrink instruction with three operands.
4417 if (Src2) {
4418 switch (MI.getOpcode()) {
4419 default: return false;
4420
4421 case AMDGPU::V_ADDC_U32_e64:
4422 case AMDGPU::V_SUBB_U32_e64:
4423 case AMDGPU::V_SUBBREV_U32_e64: {
4424 const MachineOperand *Src1
4425 = getNamedOperand(MI, AMDGPU::OpName::src1);
4426 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4427 return false;
4428 // Additional verification is needed for sdst/src2.
4429 return true;
4430 }
4431 case AMDGPU::V_MAC_F16_e64:
4432 case AMDGPU::V_MAC_F32_e64:
4433 case AMDGPU::V_MAC_LEGACY_F32_e64:
4434 case AMDGPU::V_FMAC_F16_e64:
4435 case AMDGPU::V_FMAC_F16_t16_e64:
4436 case AMDGPU::V_FMAC_F32_e64:
4437 case AMDGPU::V_FMAC_F64_e64:
4438 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4439 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4440 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4441 return false;
4442 break;
4443
4444 case AMDGPU::V_CNDMASK_B32_e64:
4445 break;
4446 }
4447 }
4448
4449 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4450 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4451 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4452 return false;
4453
4454 // We don't need to check src0, all input types are legal, so just make sure
4455 // src0 isn't using any modifiers.
4456 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4457 return false;
4458
4459 // Can it be shrunk to a valid 32 bit opcode?
4460 if (!hasVALU32BitEncoding(MI.getOpcode()))
4461 return false;
4462
4463 // Check output modifiers
4464 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4465 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4466 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel);
4467}
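// Illustrative cases (hypothetical MIR, not from the original source): a
// V_ADD_F32_e64 with plain VGPR sources and no clamp/omod/source modifiers
// can be shrunk to V_ADD_F32_e32, while the same instruction with omod set,
// or a V_MAC_F32_e64 whose src2 is an SGPR, is rejected.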
4468
4469// Set VCC operand with all flags from \p Orig, except for setting it as
4470// implicit.
4471static void copyFlagsToImplicitVCC(MachineInstr &MI,
4472 const MachineOperand &Orig) {
4473
4474 for (MachineOperand &Use : MI.implicit_operands()) {
4475 if (Use.isUse() &&
4476 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4477 Use.setIsUndef(Orig.isUndef());
4478 Use.setIsKill(Orig.isKill());
4479 return;
4480 }
4481 }
4482}
4483
4484MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4485 unsigned Op32) const {
4486 MachineBasicBlock *MBB = MI.getParent();
4487
4488 const MCInstrDesc &Op32Desc = get(Op32);
4489 MachineInstrBuilder Inst32 =
4490 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4491 .setMIFlags(MI.getFlags());
4492
4493 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4494 // For VOPC instructions, this is replaced by an implicit def of vcc.
4495
4496 // We assume the defs of the shrunk opcode are in the same order, and the
4497 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4498 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4499 Inst32.add(MI.getOperand(I));
4500
4501 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4502
4503 int Idx = MI.getNumExplicitDefs();
4504 for (const MachineOperand &Use : MI.explicit_uses()) {
4505 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4506 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4507 continue;
4508
4509 if (&Use == Src2) {
4510 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4511 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4512 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4513 // of vcc was already added during the initial BuildMI, but we
4514 // 1) may need to change vcc to vcc_lo to preserve the original register
4515 // 2) have to preserve the original flags.
4516 fixImplicitOperands(*Inst32);
4517 copyFlagsToImplicitVCC(*Inst32, *Src2);
4518 continue;
4519 }
4520 }
4521
4522 Inst32.add(Use);
4523 }
4524
4525 // FIXME: Losing implicit operands
4526
4527 return Inst32;
4528}
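// Illustrative use (assumed caller, not from the original source): the VOP3
// shrinking pass is expected to check canShrink() first and then call
// something like buildShrunkInst(MI, AMDGPU::getVOPe32(Opc)) to emit, e.g.,
// V_ADD_F32_e32 in place of a modifier-free V_ADD_F32_e64; for V_CNDMASK_B32
// the explicit carry source is replaced by the implicit VCC read as described
// above.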
4529
4530bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4531 const MachineOperand &MO,
4532 const MCOperandInfo &OpInfo) const {
4533 // Literal constants use the constant bus.
4534 if (!MO.isReg())
4535 return !isInlineConstant(MO, OpInfo);
4536
4537 if (!MO.isUse())
4538 return false;
4539
4540 if (MO.getReg().isVirtual())
4541 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4542
4543 // Null is free
4544 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4545 return false;
4546
4547 // SGPRs use the constant bus
4548 if (MO.isImplicit()) {
4549 return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
4550 MO.getReg() == AMDGPU::VCC_LO;
4551 }
4552 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4553 AMDGPU::SReg_64RegClass.contains(MO.getReg());
4554}
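// Examples (illustrative note, not part of the original file): an SGPR source
// such as $sgpr4 counts against the constant bus, as does a non-inline
// literal like 0x12345678; an inline immediate (e.g. 1.0), a VGPR source, and
// the SGPR_NULL register do not. Implicit reads of M0 and VCC/VCC_LO also
// count.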
4555
4556static Register findImplicitSGPRRead(const MachineInstr &MI) {
4557 for (const MachineOperand &MO : MI.implicit_operands()) {
4558 // We only care about reads.
4559 if (MO.isDef())
4560 continue;
4561
4562 switch (MO.getReg()) {
4563 case AMDGPU::VCC:
4564 case AMDGPU::VCC_LO:
4565 case AMDGPU::VCC_HI:
4566 case AMDGPU::M0:
4567 case AMDGPU::FLAT_SCR:
4568 return MO.getReg();
4569
4570 default:
4571 break;
4572 }
4573 }
4574
4575 return Register();
4576}
4577
4578static bool shouldReadExec(const MachineInstr &MI) {
4579 if (SIInstrInfo::isVALU(MI)) {
4580 switch (MI.getOpcode()) {
4581 case AMDGPU::V_READLANE_B32:
4582 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4583 case AMDGPU::V_WRITELANE_B32:
4584 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4585 return false;
4586 }
4587
4588 return true;
4589 }
4590
4591 if (MI.isPreISelOpcode() ||
4592 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4593 SIInstrInfo::isSALU(MI) ||
4594 SIInstrInfo::isSMRD(MI))
4595 return false;
4596
4597 return true;
4598}
4599
4600static bool isSubRegOf(const SIRegisterInfo &TRI,
4601 const MachineOperand &SuperVec,
4602 const MachineOperand &SubReg) {
4603 if (SubReg.getReg().isPhysical())
4604 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4605
4606 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4607 SubReg.getReg() == SuperVec.getReg();
4608}
4609
4610bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4611 StringRef &ErrInfo) const {
4612 uint16_t Opcode = MI.getOpcode();
4613 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
4614 return true;
4615
4616 const MachineFunction *MF = MI.getParent()->getParent();
4617 const MachineRegisterInfo &MRI = MF->getRegInfo();
4618
4619 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4620 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4621 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4622 int Src3Idx = -1;
4623 if (Src0Idx == -1) {
4624 // VOPD V_DUAL_* instructions use different operand names.
4625 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4626 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4627 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4628 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4629 }
4630
4631 // Make sure the number of operands is correct.
4632 const MCInstrDesc &Desc = get(Opcode);
4633 if (!Desc.isVariadic() &&
4634 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4635 ErrInfo = "Instruction has wrong number of operands.";
4636 return false;
4637 }
4638
4639 if (MI.isInlineAsm()) {
4640 // Verify register classes for inlineasm constraints.
4641 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4642 I != E; ++I) {
4643 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4644 if (!RC)
4645 continue;
4646
4647 const MachineOperand &Op = MI.getOperand(I);
4648 if (!Op.isReg())
4649 continue;
4650
4651 Register Reg = Op.getReg();
4652 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4653 ErrInfo = "inlineasm operand has incorrect register class.";
4654 return false;
4655 }
4656 }
4657
4658 return true;
4659 }
4660
4661 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4662 ErrInfo = "missing memory operand from image instruction.";
4663 return false;
4664 }
4665
4666 // Make sure the register classes are correct.
4667 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4668 const MachineOperand &MO = MI.getOperand(i);
4669 if (MO.isFPImm()) {
4670 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4671 "all fp values to integers.";
4672 return false;
4673 }
4674
4675 int RegClass = Desc.operands()[i].RegClass;
4676
4677 switch (Desc.operands()[i].OperandType) {
4678 case MCOI::OPERAND_REGISTER:
4679 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4680 ErrInfo = "Illegal immediate value for operand.";
4681 return false;
4682 }
4683 break;
4684 case AMDGPU::OPERAND_REG_IMM_INT32:
4685 case AMDGPU::OPERAND_REG_IMM_FP32:
4686 case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
4687 case AMDGPU::OPERAND_REG_IMM_V2FP32:
4688 break;
4689 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
4690 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
4691 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
4692 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
4693 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
4694 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
4695 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
4696 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
4697 case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
4698 case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
4699 case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
4700 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4701 ErrInfo = "Illegal immediate value for operand.";
4702 return false;
4703 }
4704 break;
4705 }
4706 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
4707 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4708 ErrInfo = "Expected inline constant for operand.";
4709 return false;
4710 }
4711 break;
4712 case MCOI::OPERAND_IMMEDIATE:
4713 case AMDGPU::OPERAND_KIMM32:
4714 // Check if this operand is an immediate.
4715 // FrameIndex operands will be replaced by immediates, so they are
4716 // allowed.
4717 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4718 ErrInfo = "Expected immediate, but got non-immediate";
4719 return false;
4720 }
4721 [[fallthrough]];
4722 default:
4723 continue;
4724 }
4725
4726 if (!MO.isReg())
4727 continue;
4728 Register Reg = MO.getReg();
4729 if (!Reg)
4730 continue;
4731
4732 // FIXME: Ideally we would have separate instruction definitions with the
4733 // aligned register constraint.
4734 // FIXME: We do not verify inline asm operands, but custom inline asm
4735 // verification is broken anyway
4736 if (ST.needsAlignedVGPRs()) {
4737 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4738 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4739 const TargetRegisterClass *SubRC =
4740 RI.getSubRegisterClass(RC, MO.getSubReg());
4741 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4742 if (RC)
4743 RC = SubRC;
4744 }
4745
4746 // Check that this is the aligned version of the class.
4747 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4748 ErrInfo = "Subtarget requires even aligned vector registers";
4749 return false;
4750 }
4751 }
4752
4753 if (RegClass != -1) {
4754 if (Reg.isVirtual())
4755 continue;
4756
4757 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4758 if (!RC->contains(Reg)) {
4759 ErrInfo = "Operand has incorrect register class.";
4760 return false;
4761 }
4762 }
4763 }
4764
4765 // Verify SDWA
4766 if (isSDWA(MI)) {
4767 if (!ST.hasSDWA()) {
4768 ErrInfo = "SDWA is not supported on this target";
4769 return false;
4770 }
4771
4772 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4773
4774 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4775 if (OpIdx == -1)
4776 continue;
4777 const MachineOperand &MO = MI.getOperand(OpIdx);
4778
4779 if (!ST.hasSDWAScalar()) {
4780 // Only VGPRS on VI
4781 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4782 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4783 return false;
4784 }
4785 } else {
4786 // No immediates on GFX9
4787 if (!MO.isReg()) {
4788 ErrInfo =
4789 "Only reg allowed as operands in SDWA instructions on GFX9+";
4790 return false;
4791 }
4792 }
4793 }
4794
4795 if (!ST.hasSDWAOmod()) {
4796 // No omod allowed on VI
4797 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4798 if (OMod != nullptr &&
4799 (!OMod->isImm() || OMod->getImm() != 0)) {
4800 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4801 return false;
4802 }
4803 }
4804
4805 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
4806 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
4807 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
4808 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
4809 const MachineOperand *Src0ModsMO =
4810 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4811 unsigned Mods = Src0ModsMO->getImm();
4812 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
4813 Mods & SISrcMods::SEXT) {
4814 ErrInfo = "sext, abs and neg are not allowed on this instruction";
4815 return false;
4816 }
4817 }
4818
4819 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4820 if (isVOPC(BasicOpcode)) {
4821 if (!ST.hasSDWASdst() && DstIdx != -1) {
4822 // Only vcc allowed as dst on VI for VOPC
4823 const MachineOperand &Dst = MI.getOperand(DstIdx);
4824 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4825 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4826 return false;
4827 }
4828 } else if (!ST.hasSDWAOutModsVOPC()) {
4829 // No clamp allowed on GFX9 for VOPC
4830 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4831 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4832 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
4833 return false;
4834 }
4835
4836 // No omod allowed on GFX9 for VOPC
4837 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4838 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4839 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
4840 return false;
4841 }
4842 }
4843 }
4844
4845 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
4846 if (DstUnused && DstUnused->isImm() &&
4847 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4848 const MachineOperand &Dst = MI.getOperand(DstIdx);
4849 if (!Dst.isReg() || !Dst.isTied()) {
4850 ErrInfo = "Dst register should have tied register";
4851 return false;
4852 }
4853
4854 const MachineOperand &TiedMO =
4855 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
4856 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
4857 ErrInfo =
4858 "Dst register should be tied to implicit use of preserved register";
4859 return false;
4860 }
4861 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
4862 ErrInfo = "Dst register should use same physical register as preserved";
4863 return false;
4864 }
4865 }
4866 }
4867
4868 // Verify MIMG / VIMAGE / VSAMPLE
4869 if (isImage(MI.getOpcode()) && !MI.mayStore()) {
4870 // Ensure that the return type used is large enough for all the options
4871 // being used TFE/LWE require an extra result register.
4872 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4873 if (DMask) {
4874 uint64_t DMaskImm = DMask->getImm();
4875 uint32_t RegCount =
4876 isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm);
4877 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
4878 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
4879 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
4880
4881 // Adjust for packed 16 bit values
4882 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
4883 RegCount = divideCeil(RegCount, 2);
4884
4885 // Adjust if using LWE or TFE
4886 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
4887 RegCount += 1;
4888
4889 const uint32_t DstIdx =
4890 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
4891 const MachineOperand &Dst = MI.getOperand(DstIdx);
4892 if (Dst.isReg()) {
4893 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
4894 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
4895 if (RegCount > DstSize) {
4896 ErrInfo = "Image instruction returns too many registers for dst "
4897 "register class";
4898 return false;
4899 }
4900 }
4901 }
4902 }
4903
4904 // Verify VOP*. Ignore multiple sgpr operands on writelane.
4905 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
4906 unsigned ConstantBusCount = 0;
4907 bool UsesLiteral = false;
4908 const MachineOperand *LiteralVal = nullptr;
4909
4910 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
4911 if (ImmIdx != -1) {
4912 ++ConstantBusCount;
4913 UsesLiteral = true;
4914 LiteralVal = &MI.getOperand(ImmIdx);
4915 }
4916
4917 SmallVector<Register, 2> SGPRsUsed;
4918 Register SGPRUsed;
4919
4920 // Only look at the true operands. Only a real operand can use the constant
4921 // bus, and we don't want to check pseudo-operands like the source modifier
4922 // flags.
4923 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
4924 if (OpIdx == -1)
4925 continue;
4926 const MachineOperand &MO = MI.getOperand(OpIdx);
4927 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4928 if (MO.isReg()) {
4929 SGPRUsed = MO.getReg();
4930 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
4931 ++ConstantBusCount;
4932 SGPRsUsed.push_back(SGPRUsed);
4933 }
4934 } else {
4935 if (!UsesLiteral) {
4936 ++ConstantBusCount;
4937 UsesLiteral = true;
4938 LiteralVal = &MO;
4939 } else if (!MO.isIdenticalTo(*LiteralVal)) {
4940 assert(isVOP2(MI) || isVOP3(MI));
4941 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
4942 return false;
4943 }
4944 }
4945 }
4946 }
4947
4948 SGPRUsed = findImplicitSGPRRead(MI);
4949 if (SGPRUsed) {
4950 // Implicit uses may safely overlap true operands
4951 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
4952 return !RI.regsOverlap(SGPRUsed, SGPR);
4953 })) {
4954 ++ConstantBusCount;
4955 SGPRsUsed.push_back(SGPRUsed);
4956 }
4957 }
4958
4959 // v_writelane_b32 is an exception from constant bus restriction:
4960 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
4961 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
4962 Opcode != AMDGPU::V_WRITELANE_B32) {
4963 ErrInfo = "VOP* instruction violates constant bus restriction";
4964 return false;
4965 }
4966
4967 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
4968 ErrInfo = "VOP3 instruction uses literal";
4969 return false;
4970 }
4971 }
4972
4973 // Special case for writelane - this can break the multiple constant bus rule,
4974 // but still can't use more than one SGPR register
4975 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
4976 unsigned SGPRCount = 0;
4977 Register SGPRUsed;
4978
4979 for (int OpIdx : {Src0Idx, Src1Idx}) {
4980 if (OpIdx == -1)
4981 break;
4982
4983 const MachineOperand &MO = MI.getOperand(OpIdx);
4984
4985 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4986 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
4987 if (MO.getReg() != SGPRUsed)
4988 ++SGPRCount;
4989 SGPRUsed = MO.getReg();
4990 }
4991 }
4992 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
4993 ErrInfo = "WRITELANE instruction violates constant bus restriction";
4994 return false;
4995 }
4996 }
4997 }
4998
4999 // Verify misc. restrictions on specific instructions.
5000 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5001 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5002 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5003 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5004 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5005 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5006 if (!compareMachineOp(Src0, Src1) &&
5007 !compareMachineOp(Src0, Src2)) {
5008 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5009 return false;
5010 }
5011 }
5012 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5013 SISrcMods::ABS) ||
5014 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5015 SISrcMods::ABS) ||
5016 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5017 SISrcMods::ABS)) {
5018 ErrInfo = "ABS not allowed in VOP3B instructions";
5019 return false;
5020 }
5021 }
5022
5023 if (isSOP2(MI) || isSOPC(MI)) {
5024 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5025 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5026
5027 if (!Src0.isReg() && !Src1.isReg() &&
5028 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5029 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5030 !Src0.isIdenticalTo(Src1)) {
5031 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5032 return false;
5033 }
5034 }
5035
5036 if (isSOPK(MI)) {
5037 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5038 if (Desc.isBranch()) {
5039 if (!Op->isMBB()) {
5040 ErrInfo = "invalid branch target for SOPK instruction";
5041 return false;
5042 }
5043 } else {
5044 uint64_t Imm = Op->getImm();
5045 if (sopkIsZext(Opcode)) {
5046 if (!isUInt<16>(Imm)) {
5047 ErrInfo = "invalid immediate for SOPK instruction";
5048 return false;
5049 }
5050 } else {
5051 if (!isInt<16>(Imm)) {
5052 ErrInfo = "invalid immediate for SOPK instruction";
5053 return false;
5054 }
5055 }
5056 }
5057 }
5058
5059 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5060 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5061 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5062 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5063 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5064 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5065
5066 const unsigned StaticNumOps =
5067 Desc.getNumOperands() + Desc.implicit_uses().size();
5068 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5069
5070 // Allow additional implicit operands. This allows a fixup done by the post
5071 // RA scheduler where the main implicit operand is killed and implicit-defs
5072 // are added for sub-registers that remain live after this instruction.
5073 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5074 ErrInfo = "missing implicit register operands";
5075 return false;
5076 }
5077
5078 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5079 if (IsDst) {
5080 if (!Dst->isUse()) {
5081 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5082 return false;
5083 }
5084
5085 unsigned UseOpIdx;
5086 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5087 UseOpIdx != StaticNumOps + 1) {
5088 ErrInfo = "movrel implicit operands should be tied";
5089 return false;
5090 }
5091 }
5092
5093 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5094 const MachineOperand &ImpUse
5095 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5096 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5097 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5098 ErrInfo = "src0 should be subreg of implicit vector use";
5099 return false;
5100 }
5101 }
5102
5103 // Make sure we aren't losing exec uses in the td files. This mostly requires
5104 // being careful when using let Uses to try to add other use registers.
5105 if (shouldReadExec(MI)) {
5106 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5107 ErrInfo = "VALU instruction does not implicitly read exec mask";
5108 return false;
5109 }
5110 }
5111
5112 if (isSMRD(MI)) {
5113 if (MI.mayStore() &&
5114 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5115 // The register offset form of scalar stores may only use m0 as the
5116 // soffset register.
5117 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5118 if (Soff && Soff->getReg() != AMDGPU::M0) {
5119 ErrInfo = "scalar stores must use m0 as offset register";
5120 return false;
5121 }
5122 }
5123 }
5124
5125 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5126 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5127 if (Offset->getImm() != 0) {
5128 ErrInfo = "subtarget does not support offsets in flat instructions";
5129 return false;
5130 }
5131 }
5132
5133 if (isDS(MI) && !ST.hasGDS()) {
5134 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5135 if (GDSOp && GDSOp->getImm() != 0) {
5136 ErrInfo = "GDS is not supported on this subtarget";
5137 return false;
5138 }
5139 }
5140
5141 if (isImage(MI)) {
5142 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5143 if (DimOp) {
5144 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5145 AMDGPU::OpName::vaddr0);
5146 int RSrcOpName =
5147 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5148 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5149 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5150 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5151 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5152 const AMDGPU::MIMGDimInfo *Dim =
5153 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5154
5155 if (!Dim) {
5156 ErrInfo = "dim is out of range";
5157 return false;
5158 }
5159
5160 bool IsA16 = false;
5161 if (ST.hasR128A16()) {
5162 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5163 IsA16 = R128A16->getImm() != 0;
5164 } else if (ST.hasA16()) {
5165 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5166 IsA16 = A16->getImm() != 0;
5167 }
5168
5169 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5170
5171 unsigned AddrWords =
5172 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5173
5174 unsigned VAddrWords;
5175 if (IsNSA) {
5176 VAddrWords = RsrcIdx - VAddr0Idx;
5177 if (ST.hasPartialNSAEncoding() &&
5178 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5179 unsigned LastVAddrIdx = RsrcIdx - 1;
5180 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5181 }
5182 } else {
5183 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5184 if (AddrWords > 12)
5185 AddrWords = 16;
5186 }
5187
5188 if (VAddrWords != AddrWords) {
5189 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5190 << " but got " << VAddrWords << "\n");
5191 ErrInfo = "bad vaddr size";
5192 return false;
5193 }
5194 }
5195 }
5196
5197 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5198 if (DppCt) {
5199 using namespace AMDGPU::DPP;
5200
5201 unsigned DC = DppCt->getImm();
5202 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5203 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5204 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5205 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5206 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5207 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5208 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5209 ErrInfo = "Invalid dpp_ctrl value";
5210 return false;
5211 }
5212 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5214 ErrInfo = "Invalid dpp_ctrl value: "
5215 "wavefront shifts are not supported on GFX10+";
5216 return false;
5217 }
5218 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5220 ErrInfo = "Invalid dpp_ctrl value: "
5221 "broadcasts are not supported on GFX10+";
5222 return false;
5223 }
5224 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5225 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5226 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5227 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5228 !ST.hasGFX90AInsts()) {
5229 ErrInfo = "Invalid dpp_ctrl value: "
5230 "row_newbroadcast/row_share is not supported before "
5231 "GFX90A/GFX10";
5232 return false;
5233 }
5234 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5235 ErrInfo = "Invalid dpp_ctrl value: "
5236 "row_share and row_xmask are not supported before GFX10";
5237 return false;
5238 }
5239 }
5240
5241 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5243 ErrInfo = "Invalid dpp_ctrl value: "
5244 "DP ALU dpp only support row_newbcast";
5245 return false;
5246 }
5247 }
5248
5249 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5250 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5251 uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
5252 : AMDGPU::OpName::vdata;
5253 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
5254 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5255 if (Data && !Data->isReg())
5256 Data = nullptr;
5257
5258 if (ST.hasGFX90AInsts()) {
5259 if (Dst && Data &&
5260 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5261 ErrInfo = "Invalid register class: "
5262 "vdata and vdst should be both VGPR or AGPR";
5263 return false;
5264 }
5265 if (Data && Data2 &&
5266 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5267 ErrInfo = "Invalid register class: "
5268 "both data operands should be VGPR or AGPR";
5269 return false;
5270 }
5271 } else {
5272 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5273 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5274 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5275 ErrInfo = "Invalid register class: "
5276 "agpr loads and stores not supported on this GPU";
5277 return false;
5278 }
5279 }
5280 }
5281
5282 if (ST.needsAlignedVGPRs()) {
5283 const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
5284 const MachineOperand *Op = getNamedOperand(MI, OpName);
5285 if (!Op)
5286 return true;
5287 Register Reg = Op->getReg();
5288 if (Reg.isPhysical())
5289 return !(RI.getHWRegIndex(Reg) & 1);
5290 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5291 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5292 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5293 };
5294
5295 if (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
5296 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
5297 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
5298
5299 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5300 ErrInfo = "Subtarget requires even aligned vector registers "
5301 "for DS_GWS instructions";
5302 return false;
5303 }
5304 }
5305
5306 if (isMIMG(MI)) {
5307 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5308 ErrInfo = "Subtarget requires even aligned vector registers "
5309 "for vaddr operand of image instructions";
5310 return false;
5311 }
5312 }
5313 }
5314
5315 if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
5316 !ST.hasGFX90AInsts()) {
5317 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5318 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5319 ErrInfo = "Invalid register class: "
5320 "v_accvgpr_write with an SGPR is not supported on this GPU";
5321 return false;
5322 }
5323 }
5324
5325 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5326 const MachineOperand &SrcOp = MI.getOperand(1);
5327 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5328 ErrInfo = "pseudo expects only physical SGPRs";
5329 return false;
5330 }
5331 }
5332
5333 return true;
5334}
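// Illustrative failures (hypothetical MIR, not from the original source): a
// VOP2 instruction supplying two different 32-bit literals trips the "uses
// more than one literal" check, an SDWA opcode on a subtarget without SDWA
// support is rejected outright, and an image load whose dmask requests more
// result registers than its dst register class provides reports "Image
// instruction returns too many registers for dst register class".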
5335
5336// It is more readable to list mapped opcodes on the same line.
5337// clang-format off
5338
5339unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5340 switch (MI.getOpcode()) {
5341 default: return AMDGPU::INSTRUCTION_LIST_END;
5342 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5343 case AMDGPU::COPY: return AMDGPU::COPY;
5344 case AMDGPU::PHI: return AMDGPU::PHI;
5345 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5346 case AMDGPU::WQM: return AMDGPU::WQM;
5347 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5348 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5349 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5350 case AMDGPU::S_MOV_B32: {
5351 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5352 return MI.getOperand(1).isReg() ||
5353 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5354 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5355 }
5356 case AMDGPU::S_ADD_I32:
5357 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5358 case AMDGPU::S_ADDC_U32:
5359 return AMDGPU::V_ADDC_U32_e32;
5360 case AMDGPU::S_SUB_I32:
5361 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5362 // FIXME: These are not consistently handled, and selected when the carry is
5363 // used.
5364 case AMDGPU::S_ADD_U32:
5365 return AMDGPU::V_ADD_CO_U32_e32;
5366 case AMDGPU::S_SUB_U32:
5367 return AMDGPU::V_SUB_CO_U32_e32;
5368 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5369 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5370 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5371 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5372 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5373 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5374 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5375 case AMDGPU::S_XNOR_B32:
5376 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5377 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5378 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5379 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5380 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5381 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5382 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5383 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5384 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5385 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5386 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5387 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5388 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5389 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5390 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5391 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5392 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5393 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5394 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5395 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5396 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5397 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5398 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5399 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5400 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5401 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5402 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5403 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5404 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5405 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5406 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5407 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5408 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5409 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5410 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5411 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5412 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5413 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5414 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5415 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5416 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5417 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5418 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5419 case AMDGPU::S_CVT_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5420 case AMDGPU::S_CVT_HI_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5421 case AMDGPU::S_CVT_F16_F32: return AMDGPU::V_CVT_F16_F32_t16_e64;
5422 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5423 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5424 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5425 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5426 case AMDGPU::S_CEIL_F16:
5427 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5428 : AMDGPU::V_CEIL_F16_fake16_e64;
5429 case AMDGPU::S_FLOOR_F16:
5430 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5431 : AMDGPU::V_FLOOR_F16_fake16_e64;
5432 case AMDGPU::S_TRUNC_F16:
5433 return AMDGPU::V_TRUNC_F16_fake16_e64;
5434 case AMDGPU::S_RNDNE_F16:
5435 return AMDGPU::V_RNDNE_F16_fake16_e64;
5436 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5437 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5438 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5439 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5440 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5441 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5442 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5443 case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
5444 case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
5445 case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
5446 case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5447 case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5448 case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5449 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
5450 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5451 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5452 case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64;
5453 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5454 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5455 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5456 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5457 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5458 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5459 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5460 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5461 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5462 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5463 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5464 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5465 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5466 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5467 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5468 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5469 case AMDGPU::S_CMP_LT_F16: return AMDGPU::V_CMP_LT_F16_t16_e64;
5470 case AMDGPU::S_CMP_EQ_F16: return AMDGPU::V_CMP_EQ_F16_t16_e64;
5471 case AMDGPU::S_CMP_LE_F16: return AMDGPU::V_CMP_LE_F16_t16_e64;
5472 case AMDGPU::S_CMP_GT_F16: return AMDGPU::V_CMP_GT_F16_t16_e64;
5473 case AMDGPU::S_CMP_LG_F16: return AMDGPU::V_CMP_LG_F16_t16_e64;
5474 case AMDGPU::S_CMP_GE_F16: return AMDGPU::V_CMP_GE_F16_t16_e64;
5475 case AMDGPU::S_CMP_O_F16: return AMDGPU::V_CMP_O_F16_t16_e64;
5476 case AMDGPU::S_CMP_U_F16: return AMDGPU::V_CMP_U_F16_t16_e64;
5477 case AMDGPU::S_CMP_NGE_F16: return AMDGPU::V_CMP_NGE_F16_t16_e64;
5478 case AMDGPU::S_CMP_NLG_F16: return AMDGPU::V_CMP_NLG_F16_t16_e64;
5479 case AMDGPU::S_CMP_NGT_F16: return AMDGPU::V_CMP_NGT_F16_t16_e64;
5480 case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
5481 case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
5482 case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
5483 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5484 case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
5485 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5486 case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
5487 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5488 case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
5489 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5490 case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
5491 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5492 case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
5493 }
5495 "Unexpected scalar opcode without corresponding vector one!");
5496}
5497
5498// clang-format on
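// Usage sketch (illustrative): callers such as the moveToVALU machinery later
// in this file treat INSTRUCTION_LIST_END as "no single VALU equivalent" and
// handle the instruction some other way (for example by only legalizing its
// operands or using a dedicated expansion), roughly:
//   unsigned NewOpc = getVALUOp(Inst);   // e.g. S_AND_B32 -> V_AND_B32_e64
//   if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
//     ; // no direct mapping; fall back to other handling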
5499
5500 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5501 MachineBasicBlock &MBB,
5502 MachineBasicBlock::iterator MBBI,
5503 const DebugLoc &DL, Register Reg,
5504 bool IsSCCLive,
5505 SlotIndexes *Indexes) const {
5506 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5507 const SIInstrInfo *TII = ST.getInstrInfo();
5508 bool IsWave32 = ST.isWave32();
5509 if (IsSCCLive) {
5510 // Insert two move instructions, one to save the original value of EXEC and
5511 // the other to turn on all bits in EXEC. This is required as we can't use
5512 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5513 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5514 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5515 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5516 .addReg(Exec, RegState::Kill);
5517 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5518 if (Indexes) {
5519 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5520 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5521 }
5522 } else {
5523 const unsigned OrSaveExec =
5524 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5525 auto SaveExec =
5526 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5527 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5528 if (Indexes)
5529 Indexes->insertMachineInstrInMaps(*SaveExec);
5530 }
5531}
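// Illustrative sketch (wave64 shown) of the two sequences built above:
//   SCC live:  s_mov_b64 <Reg>, exec        ; save EXEC without touching SCC
//              s_mov_b64 exec, -1           ; enable all lanes
//   SCC dead:  s_or_saveexec_b64 <Reg>, -1  ; single instruction, clobbers SCC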
5532
5533 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5534 MachineBasicBlock::iterator MBBI,
5535 const DebugLoc &DL, Register Reg,
5536 SlotIndexes *Indexes) const {
5537 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5538 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5539 auto ExecRestoreMI =
5540 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5541 if (Indexes)
5542 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5543}
5544
5545static const TargetRegisterClass *
5547 const MachineRegisterInfo &MRI,
5548 const MCInstrDesc &TID, unsigned RCID,
5549 bool IsAllocatable) {
5550 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5551 (((TID.mayLoad() || TID.mayStore()) &&
5552 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5554 switch (RCID) {
5555 case AMDGPU::AV_32RegClassID:
5556 RCID = AMDGPU::VGPR_32RegClassID;
5557 break;
5558 case AMDGPU::AV_64RegClassID:
5559 RCID = AMDGPU::VReg_64RegClassID;
5560 break;
5561 case AMDGPU::AV_96RegClassID:
5562 RCID = AMDGPU::VReg_96RegClassID;
5563 break;
5564 case AMDGPU::AV_128RegClassID:
5565 RCID = AMDGPU::VReg_128RegClassID;
5566 break;
5567 case AMDGPU::AV_160RegClassID:
5568 RCID = AMDGPU::VReg_160RegClassID;
5569 break;
5570 case AMDGPU::AV_512RegClassID:
5571 RCID = AMDGPU::VReg_512RegClassID;
5572 break;
5573 default:
5574 break;
5575 }
5576 }
5577
5578 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5579}
5580
5582 unsigned OpNum, const TargetRegisterInfo *TRI,
5583 const MachineFunction &MF)
5584 const {
5585 if (OpNum >= TID.getNumOperands())
5586 return nullptr;
5587 auto RegClass = TID.operands()[OpNum].RegClass;
5588 bool IsAllocatable = false;
5590 // vdst and vdata should both be VGPR or AGPR, same for the DS instructions
5591 // with two data operands. Request a register class constrained to VGPR only
5592 // if both operands are present, as Machine Copy Propagation cannot check
5593 // this constraint, and possibly other passes cannot either.
5594 //
5595 // The check is limited to FLAT and DS because atomics in non-flat encoding
5596 // have their vdst and vdata tied to be the same register.
5597 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5598 AMDGPU::OpName::vdst);
5599 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5600 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5601 : AMDGPU::OpName::vdata);
5602 if (DataIdx != -1) {
5603 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5604 TID.Opcode, AMDGPU::OpName::data1);
5605 }
5606 }
5607 return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
5608 IsAllocatable);
5609}
5610
5612 unsigned OpNo) const {
5613 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5614 const MCInstrDesc &Desc = get(MI.getOpcode());
5615 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
5616 Desc.operands()[OpNo].RegClass == -1) {
5617 Register Reg = MI.getOperand(OpNo).getReg();
5618
5619 if (Reg.isVirtual())
5620 return MRI.getRegClass(Reg);
5621 return RI.getPhysRegBaseClass(Reg);
5622 }
5623
5624 unsigned RCID = Desc.operands()[OpNo].RegClass;
5625 return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
5626}
5627
5630 MachineBasicBlock *MBB = MI.getParent();
5631 MachineOperand &MO = MI.getOperand(OpIdx);
5633 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
5634 const TargetRegisterClass *RC = RI.getRegClass(RCID);
5635 unsigned Size = RI.getRegSizeInBits(*RC);
5636 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
5637 if (MO.isReg())
5638 Opcode = AMDGPU::COPY;
5639 else if (RI.isSGPRClass(RC))
5640 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
5641
5642 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
5643 Register Reg = MRI.createVirtualRegister(VRC);
5645 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
5646 MO.ChangeToRegister(Reg, false);
5647}
5648
5651 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
5652 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5653 MachineBasicBlock *MBB = MI->getParent();
5654 DebugLoc DL = MI->getDebugLoc();
5655 Register SubReg = MRI.createVirtualRegister(SubRC);
5656
5657 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
5658 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5659 .addReg(SuperReg.getReg(), 0, NewSubIdx);
5660 return SubReg;
5661}
5662
5665 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
5666 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5667 if (Op.isImm()) {
5668 if (SubIdx == AMDGPU::sub0)
5669 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
5670 if (SubIdx == AMDGPU::sub1)
5671 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
5672
5673 llvm_unreachable("Unhandled register index for immediate");
5674 }
5675
5676 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
5677 SubIdx, SubRC);
5678 return MachineOperand::CreateReg(SubReg, false);
5679}
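// Worked example (illustrative) for the immediate path above: with
// Op.getImm() == 0x1111111122222222, sub0 yields 0x22222222 (low 32 bits)
// and sub1 yields 0x11111111 (high 32 bits).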
5680
5681// Change the order of operands from (0, 1, 2) to (0, 2, 1)
5682void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
5683 assert(Inst.getNumExplicitOperands() == 3);
5684 MachineOperand Op1 = Inst.getOperand(1);
5685 Inst.removeOperand(1);
5686 Inst.addOperand(Op1);
5687}
5688
5690 const MCOperandInfo &OpInfo,
5691 const MachineOperand &MO) const {
5692 if (!MO.isReg())
5693 return false;
5694
5695 Register Reg = MO.getReg();
5696
5697 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
5698 if (Reg.isPhysical())
5699 return DRC->contains(Reg);
5700
5701 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
5702
5703 if (MO.getSubReg()) {
5704 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5705 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
5706 if (!SuperRC)
5707 return false;
5708
5709 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
5710 if (!DRC)
5711 return false;
5712 }
5713 return RC->hasSuperClassEq(DRC);
5714}
5715
5717 const MCOperandInfo &OpInfo,
5718 const MachineOperand &MO) const {
5719 if (MO.isReg())
5720 return isLegalRegOperand(MRI, OpInfo, MO);
5721
5722 // Handle non-register types that are treated like immediates.
5723 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
5724 return true;
5725}
5726
5727bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
5728 const MachineOperand *MO) const {
5729 const MachineFunction &MF = *MI.getParent()->getParent();
5730 const MachineRegisterInfo &MRI = MF.getRegInfo();
5731 const MCInstrDesc &InstDesc = MI.getDesc();
5732 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
5733 const TargetRegisterClass *DefinedRC =
5734 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
5735 if (!MO)
5736 MO = &MI.getOperand(OpIdx);
5737
5738 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
5739 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
5740 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
5741 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--)
5742 return false;
5743
5745 if (MO->isReg())
5746 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
5747
5748 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
5749 if (i == OpIdx)
5750 continue;
5751 const MachineOperand &Op = MI.getOperand(i);
5752 if (Op.isReg()) {
5753 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
5754 if (!SGPRsUsed.count(SGPR) &&
5755 // FIXME: This can access off the end of the operands() array.
5756 usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
5757 if (--ConstantBusLimit <= 0)
5758 return false;
5759 SGPRsUsed.insert(SGPR);
5760 }
5761 } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
5762 !isInlineConstant(Op, InstDesc.operands()[i])) {
5763 if (!LiteralLimit--)
5764 return false;
5765 if (--ConstantBusLimit <= 0)
5766 return false;
5767 }
5768 }
5769 }
5770
5771 if (MO->isReg()) {
5772 if (!DefinedRC)
5773 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
5774 if (!isLegalRegOperand(MRI, OpInfo, *MO))
5775 return false;
5776 bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
5777 if (IsAGPR && !ST.hasMAIInsts())
5778 return false;
5779 unsigned Opc = MI.getOpcode();
5780 if (IsAGPR &&
5781 (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5782 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5783 return false;
5784 // Atomics should have both vdst and vdata either vgpr or agpr.
5785 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5786 const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
5787 isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5788 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5789 MI.getOperand(DataIdx).isReg() &&
5790 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5791 return false;
5792 if ((int)OpIdx == DataIdx) {
5793 if (VDstIdx != -1 &&
5794 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5795 return false;
5796 // DS instructions with 2 src operands also must have tied RC.
5797 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
5798 AMDGPU::OpName::data1);
5799 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5800 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5801 return false;
5802 }
5803 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5804 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5805 RI.isSGPRReg(MRI, MO->getReg()))
5806 return false;
5807 return true;
5808 }
5809
5810 if (MO->isImm()) {
5811 uint64_t Imm = MO->getImm();
5812 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
5813 bool Is64BitOp = Is64BitFPOp ||
5817 if (Is64BitOp &&
5819 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
5820 return false;
5821
5822 // FIXME: We can use sign extended 64-bit literals, but only for signed
5823 // operands. At the moment we do not know if an operand is signed.
5824 // Such an operand will be encoded as its low 32 bits and then either
5825 // correctly sign extended or incorrectly zero extended by HW.
5826 if (!Is64BitFPOp && (int32_t)Imm < 0)
5827 return false;
5828 }
5829 }
5830
5831 // Handle non-register types that are treated like immediates.
5832 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
5833
5834 if (!DefinedRC) {
5835 // This operand expects an immediate.
5836 return true;
5837 }
5838
5839 return isImmOperandLegal(MI, OpIdx, *MO);
5840}
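// Example (illustrative, on a subtarget where getConstantBusLimit() == 1 and
// VOP3 literals are unsupported, i.e. pre-GFX10):
//   v_add_f32_e64 v0, s0, s1   ; two constant-bus reads -> rejected above
//   v_add_f32_e64 v0, s0, v1   ; one constant-bus read  -> accepted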
5841
5843 MachineInstr &MI) const {
5844 unsigned Opc = MI.getOpcode();
5845 const MCInstrDesc &InstrDesc = get(Opc);
5846
5847 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5848 MachineOperand &Src0 = MI.getOperand(Src0Idx);
5849
5850 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
5851 MachineOperand &Src1 = MI.getOperand(Src1Idx);
5852
5853 // If there is an implicit SGPR use such as the VCC use for v_addc_u32/v_subb_u32,
5854 // we may only have one constant bus use before GFX10.
5855 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
5856 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
5857 RI.isSGPRReg(MRI, Src0.getReg()))
5858 legalizeOpWithMove(MI, Src0Idx);
5859
5860 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
5861 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
5862 // src0/src1 with V_READFIRSTLANE.
5863 if (Opc == AMDGPU::V_WRITELANE_B32) {
5864 const DebugLoc &DL = MI.getDebugLoc();
5865 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
5866 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5867 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5868 .add(Src0);
5869 Src0.ChangeToRegister(Reg, false);
5870 }
5871 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
5872 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5873 const DebugLoc &DL = MI.getDebugLoc();
5874 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5875 .add(Src1);
5876 Src1.ChangeToRegister(Reg, false);
5877 }
5878 return;
5879 }
5880
5881 // No VOP2 instructions support AGPRs.
5882 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
5883 legalizeOpWithMove(MI, Src0Idx);
5884
5885 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
5886 legalizeOpWithMove(MI, Src1Idx);
5887
5888 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
5889 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
5890 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
5891 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
5892 legalizeOpWithMove(MI, Src2Idx);
5893 }
5894
5895 // VOP2 src0 accepts all operand types, so we don't need to check its
5896 // legality. If src1 is already legal, we don't need to do anything.
5897 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
5898 return;
5899
5900 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
5901 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
5902 // select is uniform.
5903 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
5904 RI.isVGPR(MRI, Src1.getReg())) {
5905 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5906 const DebugLoc &DL = MI.getDebugLoc();
5907 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5908 .add(Src1);
5909 Src1.ChangeToRegister(Reg, false);
5910 return;
5911 }
5912
5913 // We do not use commuteInstruction here because it is too aggressive and will
5914 // commute if it is possible. We only want to commute here if it improves
5915 // legality. This can be called a fairly large number of times so don't waste
5916 // compile time pointlessly swapping and checking legality again.
5917 if (HasImplicitSGPR || !MI.isCommutable()) {
5918 legalizeOpWithMove(MI, Src1Idx);
5919 return;
5920 }
5921
5922 // If src0 can be used as src1, commuting will make the operands legal.
5923 // Otherwise we have to give up and insert a move.
5924 //
5925 // TODO: Other immediate-like operand kinds could be commuted if there was a
5926 // MachineOperand::ChangeTo* for them.
5927 if ((!Src1.isImm() && !Src1.isReg()) ||
5928 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
5929 legalizeOpWithMove(MI, Src1Idx);
5930 return;
5931 }
5932
5933 int CommutedOpc = commuteOpcode(MI);
5934 if (CommutedOpc == -1) {
5935 legalizeOpWithMove(MI, Src1Idx);
5936 return;
5937 }
5938
5939 MI.setDesc(get(CommutedOpc));
5940
5941 Register Src0Reg = Src0.getReg();
5942 unsigned Src0SubReg = Src0.getSubReg();
5943 bool Src0Kill = Src0.isKill();
5944
5945 if (Src1.isImm())
5946 Src0.ChangeToImmediate(Src1.getImm());
5947 else if (Src1.isReg()) {
5948 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
5949 Src0.setSubReg(Src1.getSubReg());
5950 } else
5951 llvm_unreachable("Should only have register or immediate operands");
5952
5953 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
5954 Src1.setSubReg(Src0SubReg);
5956}
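// Example (illustrative): for a VOP2 such as V_ADD_CO_U32_e32 with an SGPR in
// src1, the code above first tries to commute the SGPR into the src0 slot
// (where an SGPR or inline constant is allowed) and only copies the operand
// into a VGPR via legalizeOpWithMove when commuting cannot make it legal.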
5957
5958 // Legalize VOP3 operands. All operand types are supported for any operand,
5959 // but only one literal constant is allowed, and only starting from GFX10.
5961 MachineInstr &MI) const {
5962 unsigned Opc = MI.getOpcode();
5963
5964 int VOP3Idx[3] = {
5965 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
5966 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
5967 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
5968 };
5969
5970 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
5971 Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
5972 // src1 and src2 must be scalar
5973 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
5974 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
5975 const DebugLoc &DL = MI.getDebugLoc();
5976 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
5977 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5978 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5979 .add(Src1);
5980 Src1.ChangeToRegister(Reg, false);
5981 }
5982 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
5983 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5984 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5985 .add(Src2);
5986 Src2.ChangeToRegister(Reg, false);
5987 }
5988 }
5989
5990 // Find the one SGPR operand we are allowed to use.
5991 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
5992 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
5993 SmallDenseSet<unsigned> SGPRsUsed;
5994 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
5995 if (SGPRReg) {
5996 SGPRsUsed.insert(SGPRReg);
5997 --ConstantBusLimit;
5998 }
5999
6000 for (int Idx : VOP3Idx) {
6001 if (Idx == -1)
6002 break;
6003 MachineOperand &MO = MI.getOperand(Idx);
6004
6005 if (!MO.isReg()) {
6006 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6007 continue;
6008
6009 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6010 --LiteralLimit;
6011 --ConstantBusLimit;
6012 continue;
6013 }
6014
6015 --LiteralLimit;
6016 --ConstantBusLimit;
6018 continue;
6019 }
6020
6021 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6022 !isOperandLegal(MI, Idx, &MO)) {
6024 continue;
6025 }
6026
6027 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6028 continue; // VGPRs are legal
6029
6030 // We can use one SGPR in each VOP3 instruction prior to GFX10
6031 // and two starting from GFX10.
6032 if (SGPRsUsed.count(MO.getReg()))
6033 continue;
6034 if (ConstantBusLimit > 0) {
6035 SGPRsUsed.insert(MO.getReg());
6036 --ConstantBusLimit;
6037 continue;
6038 }
6039
6040 // If we make it this far, then the operand is not legal and we must
6041 // legalize it.
6043 }
6044
6045 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6046 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6047 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6048 legalizeOpWithMove(MI, VOP3Idx[2]);
6049}
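// Example (illustrative, one constant-bus read and no VOP3 literal, i.e.
// pre-GFX10): for "V_FMA_F32 dst, s1, s2, v3" the loop above keeps one of the
// SGPRs as the single allowed constant-bus read and rewrites the other into a
// VGPR with legalizeOpWithMove.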
6050
6052 MachineRegisterInfo &MRI) const {
6053 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6054 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6055 Register DstReg = MRI.createVirtualRegister(SRC);
6056 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6057
6058 if (RI.hasAGPRs(VRC)) {
6059 VRC = RI.getEquivalentVGPRClass(VRC);
6060 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6061 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6062 get(TargetOpcode::COPY), NewSrcReg)
6063 .addReg(SrcReg);
6064 SrcReg = NewSrcReg;
6065 }
6066
6067 if (SubRegs == 1) {
6068 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6069 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6070 .addReg(SrcReg);
6071 return DstReg;
6072 }
6073
6075 for (unsigned i = 0; i < SubRegs; ++i) {
6076 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6077 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6078 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6079 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6080 SRegs.push_back(SGPR);
6081 }
6082
6084 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6085 get(AMDGPU::REG_SEQUENCE), DstReg);
6086 for (unsigned i = 0; i < SubRegs; ++i) {
6087 MIB.addReg(SRegs[i]);
6088 MIB.addImm(RI.getSubRegFromChannel(i));
6089 }
6090 return DstReg;
6091}
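// Expansion sketch (illustrative) for a 64-bit VGPR source:
//   %lo:sgpr_32 = V_READFIRSTLANE_B32 %src.sub0
//   %hi:sgpr_32 = V_READFIRSTLANE_B32 %src.sub1
//   %dst        = REG_SEQUENCE %lo, sub0, %hi, sub1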
6092
6094 MachineInstr &MI) const {
6095
6096 // If the pointer is stored in VGPRs, then we need to move it to
6097 // SGPRs using v_readfirstlane. This is safe because we only select
6098 // loads with uniform pointers to SMRD instructions, so we know the
6099 // pointer value is uniform.
6100 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6101 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6102 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6103 SBase->setReg(SGPR);
6104 }
6105 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6106 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
6107 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6108 SOff->setReg(SGPR);
6109 }
6110}
6111
6113 unsigned Opc = Inst.getOpcode();
6114 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6115 if (OldSAddrIdx < 0)
6116 return false;
6117
6119
6120 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6121 if (NewOpc < 0)
6123 if (NewOpc < 0)
6124 return false;
6125
6127 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6128 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6129 return false;
6130
6131 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6132 if (NewVAddrIdx < 0)
6133 return false;
6134
6135 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6136
6137 // Check vaddr, it shall be zero or absent.
6138 MachineInstr *VAddrDef = nullptr;
6139 if (OldVAddrIdx >= 0) {
6140 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6141 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6142 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6143 !VAddrDef->getOperand(1).isImm() ||
6144 VAddrDef->getOperand(1).getImm() != 0)
6145 return false;
6146 }
6147
6148 const MCInstrDesc &NewDesc = get(NewOpc);
6149 Inst.setDesc(NewDesc);
6150
6151 // Callers expect iterator to be valid after this call, so modify the
6152 // instruction in place.
6153 if (OldVAddrIdx == NewVAddrIdx) {
6154 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6155 // Clear use list from the old vaddr holding a zero register.
6156 MRI.removeRegOperandFromUseList(&NewVAddr);
6157 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6158 Inst.removeOperand(OldSAddrIdx);
6159 // Update the use list with the pointer we have just moved from saddr to
6160 // vaddr position. Otherwise the new vaddr will be missing from the use list.
6161 MRI.removeRegOperandFromUseList(&NewVAddr);
6162 MRI.addRegOperandToUseList(&NewVAddr);
6163 } else {
6164 assert(OldSAddrIdx == NewVAddrIdx);
6165
6166 if (OldVAddrIdx >= 0) {
6167 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6168 AMDGPU::OpName::vdst_in);
6169
6170 // removeOperand doesn't try to fixup tied operand indexes as it goes, so
6171 // it asserts. Untie the operands for now and retie them afterwards.
6172 if (NewVDstIn != -1) {
6173 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6174 Inst.untieRegOperand(OldVDstIn);
6175 }
6176
6177 Inst.removeOperand(OldVAddrIdx);
6178
6179 if (NewVDstIn != -1) {
6180 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6181 Inst.tieOperands(NewVDst, NewVDstIn);
6182 }
6183 }
6184 }
6185
6186 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6187 VAddrDef->eraseFromParent();
6188
6189 return true;
6190}
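// Example (illustrative): a GLOBAL_LOAD_*_SADDR whose saddr operand holds a
// VGPR and whose vaddr is a V_MOV_B32 of zero is converted above to the plain
// vaddr form, with the former saddr register taking the vaddr slot and the
// now-unused zero V_MOV_B32 erased.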
6191
6192// FIXME: Remove this when SelectionDAG is obsoleted.
6194 MachineInstr &MI) const {
6196 return;
6197
6198 // Fix up SGPR operands in VGPRs. We only select these when the DAG divergence
6199 // analysis thinks they are uniform, so a readfirstlane should be valid.
6200 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6201 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6202 return;
6203
6205 return;
6206
6207 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
6208 SAddr->setReg(ToSGPR);
6209}
6210
6213 const TargetRegisterClass *DstRC,
6216 const DebugLoc &DL) const {
6217 Register OpReg = Op.getReg();
6218 unsigned OpSubReg = Op.getSubReg();
6219
6220 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6221 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6222
6223 // Check if operand is already the correct register class.
6224 if (DstRC == OpRC)
6225 return;
6226
6227 Register DstReg = MRI.createVirtualRegister(DstRC);
6228 auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
6229
6230 Op.setReg(DstReg);
6231 Op.setSubReg(0);
6232
6233 MachineInstr *Def = MRI.getVRegDef(OpReg);
6234 if (!Def)
6235 return;
6236
6237 // Try to eliminate the copy if it is copying an immediate value.
6238 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6239 foldImmediate(*Copy, *Def, OpReg, &MRI);
6240
6241 bool ImpDef = Def->isImplicitDef();
6242 while (!ImpDef && Def && Def->isCopy()) {
6243 if (Def->getOperand(1).getReg().isPhysical())
6244 break;
6245 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6246 ImpDef = Def && Def->isImplicitDef();
6247 }
6248 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6249 !ImpDef)
6250 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6251}
6252
6253// Emit the actual waterfall loop, executing the wrapped instruction for each
6254// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6255// iteration, in the worst case we execute 64 (once per lane).
6256 static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
6257 MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB,
6258 MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL,
6259 ArrayRef<MachineOperand *> ScalarOps) {
6260 MachineFunction &MF = *OrigBB.getParent();
6261 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6262 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6263 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6264 unsigned SaveExecOpc =
6265 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6266 unsigned XorTermOpc =
6267 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6268 unsigned AndOpc =
6269 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6270 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6271
6273
6274 SmallVector<Register, 8> ReadlanePieces;
6275 Register CondReg;
6276
6277 for (MachineOperand *ScalarOp : ScalarOps) {
6278 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6279 unsigned NumSubRegs = RegSize / 32;
6280 Register VScalarOp = ScalarOp->getReg();
6281
6282 if (NumSubRegs == 1) {
6283 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6284
6285 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6286 .addReg(VScalarOp);
6287
6288 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6289
6290 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6291 .addReg(CurReg)
6292 .addReg(VScalarOp);
6293
6294 // Combine the comparison results with AND.
6295 if (!CondReg) // First.
6296 CondReg = NewCondReg;
6297 else { // If not the first, we create an AND.
6298 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6299 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6300 .addReg(CondReg)
6301 .addReg(NewCondReg);
6302 CondReg = AndReg;
6303 }
6304
6305 // Update ScalarOp operand to use the SGPR ScalarOp.
6306 ScalarOp->setReg(CurReg);
6307 ScalarOp->setIsKill();
6308 } else {
6309 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6310 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6311 "Unhandled register size");
6312
6313 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6314 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6315 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6316
6317 // Read the next variant <- also loop target.
6318 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6319 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6320
6321 // Read the next variant <- also loop target.
6322 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6323 .addReg(VScalarOp, VScalarOpUndef,
6324 TRI->getSubRegFromChannel(Idx + 1));
6325
6326 ReadlanePieces.push_back(CurRegLo);
6327 ReadlanePieces.push_back(CurRegHi);
6328
6329 // Comparison is to be done as 64-bit.
6330 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6331 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6332 .addReg(CurRegLo)
6333 .addImm(AMDGPU::sub0)
6334 .addReg(CurRegHi)
6335 .addImm(AMDGPU::sub1);
6336
6337 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6338 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6339 NewCondReg)
6340 .addReg(CurReg);
6341 if (NumSubRegs <= 2)
6342 Cmp.addReg(VScalarOp);
6343 else
6344 Cmp.addReg(VScalarOp, VScalarOpUndef,
6345 TRI->getSubRegFromChannel(Idx, 2));
6346
6347 // Combine the comparison results with AND.
6348 if (!CondReg) // First.
6349 CondReg = NewCondReg;
6350 else { // If not the first, we create an AND.
6351 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6352 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6353 .addReg(CondReg)
6354 .addReg(NewCondReg);
6355 CondReg = AndReg;
6356 }
6357 } // End for loop.
6358
6359 auto SScalarOpRC =
6360 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6361 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6362
6363 // Build scalar ScalarOp.
6364 auto Merge =
6365 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6366 unsigned Channel = 0;
6367 for (Register Piece : ReadlanePieces) {
6368 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6369 }
6370
6371 // Update ScalarOp operand to use the SGPR ScalarOp.
6372 ScalarOp->setReg(SScalarOp);
6373 ScalarOp->setIsKill();
6374 }
6375 }
6376
6377 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6378 MRI.setSimpleHint(SaveExec, CondReg);
6379
6380 // Update EXEC to matching lanes, saving original to SaveExec.
6381 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6382 .addReg(CondReg, RegState::Kill);
6383
6384 // The original instruction is here; we insert the terminators after it.
6385 I = BodyBB.end();
6386
6387 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6388 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6389 .addReg(Exec)
6390 .addReg(SaveExec);
6391
6392 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6393}
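// Rough shape of the emitted loop (illustrative, wave64, one 32-bit scalar
// operand):
// LoopBB:
//   %s    = V_READFIRSTLANE_B32 %vgpr_op
//   %cond = V_CMP_EQ_U32_e64 %s, %vgpr_op
//   %save = S_AND_SAVEEXEC_B64 %cond        ; run only the matching lanes
// BodyBB:
//   <the wrapped instruction, now reading %s>
//   exec = S_XOR_B64_term exec, %save       ; retire the lanes just handled
//   SI_WATERFALL_LOOP LoopBB                ; loop while any lane remains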
6394
6395// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6396// with SGPRs by iterating over all unique values across all lanes.
6397// Returns the loop basic block that now contains \p MI.
6398static MachineBasicBlock *
6399 loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6400 ArrayRef<MachineOperand *> ScalarOps,
6401 MachineDominatorTree *MDT,
6402 MachineBasicBlock::iterator Begin = nullptr,
6403 MachineBasicBlock::iterator End = nullptr) {
6404 MachineBasicBlock &MBB = *MI.getParent();
6405 MachineFunction &MF = *MBB.getParent();
6406 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6407 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6409 if (!Begin.isValid())
6410 Begin = &MI;
6411 if (!End.isValid()) {
6412 End = &MI;
6413 ++End;
6414 }
6415 const DebugLoc &DL = MI.getDebugLoc();
6416 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6417 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6418 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6419
6420 // Save SCC. Waterfall Loop may overwrite SCC.
6421 Register SaveSCCReg;
6422
6423 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
6424 // rather than doing an unlimited scan everywhere.
6425 bool SCCNotDead =
6426 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
6427 std::numeric_limits<unsigned>::max()) !=
6429 if (SCCNotDead) {
6430 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6431 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6432 .addImm(1)
6433 .addImm(0);
6434 }
6435
6436 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6437
6438 // Save the EXEC mask
6439 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6440
6441 // Killed uses in the instruction we are waterfalling around will be
6442 // incorrect due to the added control-flow.
6444 ++AfterMI;
6445 for (auto I = Begin; I != AfterMI; I++) {
6446 for (auto &MO : I->all_uses())
6447 MRI.clearKillFlags(MO.getReg());
6448 }
6449
6450 // To insert the loop we need to split the block. Move everything after this
6451 // point to a new block, and insert a new empty block between the two.
6454 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6456 ++MBBI;
6457
6458 MF.insert(MBBI, LoopBB);
6459 MF.insert(MBBI, BodyBB);
6460 MF.insert(MBBI, RemainderBB);
6461
6462 LoopBB->addSuccessor(BodyBB);
6463 BodyBB->addSuccessor(LoopBB);
6464 BodyBB->addSuccessor(RemainderBB);
6465
6466 // Move the instructions from Begin up to MI into BodyBB, and the remainder
6467 // of the block to RemainderBB.
6468 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6469 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6470 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6471
6472 MBB.addSuccessor(LoopBB);
6473
6474 // Update dominators. We know that MBB immediately dominates LoopBB, that
6475 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6476 // RemainderBB. RemainderBB immediately dominates all of the successors
6477 // transferred to it from MBB that MBB used to properly dominate.
6478 if (MDT) {
6479 MDT->addNewBlock(LoopBB, &MBB);
6480 MDT->addNewBlock(BodyBB, LoopBB);
6481 MDT->addNewBlock(RemainderBB, BodyBB);
6482 for (auto &Succ : RemainderBB->successors()) {
6483 if (MDT->properlyDominates(&MBB, Succ)) {
6484 MDT->changeImmediateDominator(Succ, RemainderBB);
6485 }
6486 }
6487 }
6488
6489 emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps);
6490
6491 MachineBasicBlock::iterator First = RemainderBB->begin();
6492 // Restore SCC
6493 if (SCCNotDead) {
6494 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
6495 .addReg(SaveSCCReg, RegState::Kill)
6496 .addImm(0);
6497 }
6498
6499 // Restore the EXEC mask
6500 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
6501 return BodyBB;
6502}
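// Resulting control flow (illustrative):
//   MBB -> LoopBB <-> BodyBB -> RemainderBB -> (original successors of MBB)
// with EXEC (and SCC, when it was live) saved in MBB and restored at the top
// of RemainderBB.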
6503
6504// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
6505static std::tuple<unsigned, unsigned>
6507 MachineBasicBlock &MBB = *MI.getParent();
6508 MachineFunction &MF = *MBB.getParent();
6510
6511 // Extract the ptr from the resource descriptor.
6512 unsigned RsrcPtr =
6513 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
6514 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
6515
6516 // Create an empty resource descriptor
6517 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6518 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6519 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6520 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6521 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
6522
6523 // Zero64 = 0
6524 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
6525 .addImm(0);
6526
6527 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6528 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
6529 .addImm(RsrcDataFormat & 0xFFFFFFFF);
6530
6531 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6532 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
6533 .addImm(RsrcDataFormat >> 32);
6534
6535 // NewSRsrc = {Zero64, SRsrcFormat}
6536 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
6537 .addReg(Zero64)
6538 .addImm(AMDGPU::sub0_sub1)
6539 .addReg(SRsrcFormatLo)
6540 .addImm(AMDGPU::sub2)
6541 .addReg(SRsrcFormatHi)
6542 .addImm(AMDGPU::sub3);
6543
6544 return std::tuple(RsrcPtr, NewSRsrc);
6545}
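// Layout of the replacement descriptor built above (illustrative):
//   NewSRsrc.sub0_sub1 = 0 (Zero64)
//   NewSRsrc.sub2      = RSRC_DATA_FORMAT[31:0]
//   NewSRsrc.sub3      = RSRC_DATA_FORMAT[63:32]
// while the original 64-bit base pointer is returned in RsrcPtr for address
// arithmetic at the call site.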
6546
6549 MachineDominatorTree *MDT) const {
6550 MachineFunction &MF = *MI.getParent()->getParent();
6551 MachineRegisterInfo &MRI = MF.getRegInfo();
6552 MachineBasicBlock *CreatedBB = nullptr;
6553
6554 // Legalize VOP2
6555 if (isVOP2(MI) || isVOPC(MI)) {
6556 legalizeOperandsVOP2(MRI, MI);
6557 return CreatedBB;
6558 }
6559
6560 // Legalize VOP3
6561 if (isVOP3(MI)) {
6562 legalizeOperandsVOP3(MRI, MI);
6563 return CreatedBB;
6564 }
6565
6566 // Legalize SMRD
6567 if (isSMRD(MI)) {
6568 legalizeOperandsSMRD(MRI, MI);
6569 return CreatedBB;
6570 }
6571
6572 // Legalize FLAT
6573 if (isFLAT(MI)) {
6574 legalizeOperandsFLAT(MRI, MI);
6575 return CreatedBB;
6576 }
6577
6578 // Legalize REG_SEQUENCE and PHI
6579 // The register class of the operands must be the same type as the register
6580 // class of the output.
6581 if (MI.getOpcode() == AMDGPU::PHI) {
6582 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
6583 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
6584 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
6585 continue;
6586 const TargetRegisterClass *OpRC =
6587 MRI.getRegClass(MI.getOperand(i).getReg());
6588 if (RI.hasVectorRegisters(OpRC)) {
6589 VRC = OpRC;
6590 } else {
6591 SRC = OpRC;
6592 }
6593 }
6594
6595 // If any of the operands are VGPR registers, then they all must be VGPRs,
6596 // otherwise we will create illegal VGPR->SGPR copies when legalizing
6597 // them.
6598 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
6599 if (!VRC) {
6600 assert(SRC);
6601 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
6602 VRC = &AMDGPU::VReg_1RegClass;
6603 } else
6604 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6605 ? RI.getEquivalentAGPRClass(SRC)
6606 : RI.getEquivalentVGPRClass(SRC);
6607 } else {
6608 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6609 ? RI.getEquivalentAGPRClass(VRC)
6610 : RI.getEquivalentVGPRClass(VRC);
6611 }
6612 RC = VRC;
6613 } else {
6614 RC = SRC;
6615 }
6616
6617 // Update all the operands so they have the same type.
6618 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6619 MachineOperand &Op = MI.getOperand(I);
6620 if (!Op.isReg() || !Op.getReg().isVirtual())
6621 continue;
6622
6623 // MI is a PHI instruction.
6624 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
6626
6627 // Avoid creating no-op copies with the same src and dst reg class. These
6628 // confuse some of the machine passes.
6629 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
6630 }
6631 }
6632
6633 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6634 // VGPR dest type and SGPR sources, insert copies so all operands are
6635 // VGPRs. This seems to help operand folding / the register coalescer.
6636 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
6637 MachineBasicBlock *MBB = MI.getParent();
6638 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
6639 if (RI.hasVGPRs(DstRC)) {
6640 // Update all the operands so they are VGPR register classes. These may
6641 // not be the same register class because REG_SEQUENCE supports mixing
6642 // subregister index types e.g. sub0_sub1 + sub2 + sub3
6643 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6644 MachineOperand &Op = MI.getOperand(I);
6645 if (!Op.isReg() || !Op.getReg().isVirtual())
6646 continue;
6647
6648 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
6649 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
6650 if (VRC == OpRC)
6651 continue;
6652
6653 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
6654 Op.setIsKill();
6655 }
6656 }
6657
6658 return CreatedBB;
6659 }
6660
6661 // Legalize INSERT_SUBREG
6662 // src0 must have the same register class as dst
6663 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
6664 Register Dst = MI.getOperand(0).getReg();
6665 Register Src0 = MI.getOperand(1).getReg();
6666 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
6667 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
6668 if (DstRC != Src0RC) {
6669 MachineBasicBlock *MBB = MI.getParent();
6670 MachineOperand &Op = MI.getOperand(1);
6671 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
6672 }
6673 return CreatedBB;
6674 }
6675
6676 // Legalize SI_INIT_M0
6677 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
6678 MachineOperand &Src = MI.getOperand(0);
6679 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6680 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6681 return CreatedBB;
6682 }
6683
6684 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6685 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
6686 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
6687 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
6688 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
6689 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
6690 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
6691 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
6692 MachineOperand &Src = MI.getOperand(1);
6693 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6694 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6695 return CreatedBB;
6696 }
6697
6698 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
6699 //
6700 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
6701 // scratch memory access. In both cases, the legalization never involves
6702 // conversion to the addr64 form.
6704 (isMUBUF(MI) || isMTBUF(MI)))) {
6705 int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc
6706 : AMDGPU::OpName::srsrc;
6707 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
6708 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
6709 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
6710
6711 int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
6712 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
6713 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
6714 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
6715
6716 return CreatedBB;
6717 }
6718
6719 // Legalize SI_CALL
6720 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
6721 MachineOperand *Dest = &MI.getOperand(0);
6722 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
6723 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and the
6724 // following copies; copies from and to physical registers also need to
6725 // be moved into the loop block.
6726 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
6727 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
6728
6729 // Also move the copies to physical registers into the loop block
6730 MachineBasicBlock &MBB = *MI.getParent();
6732 while (Start->getOpcode() != FrameSetupOpcode)
6733 --Start;
6735 while (End->getOpcode() != FrameDestroyOpcode)
6736 ++End;
6737 // Also include following copies of the return value
6738 ++End;
6739 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
6740 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
6741 ++End;
6742 CreatedBB =
6743 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
6744 }
6745 }
6746
6747 // Legalize s_sleep_var.
6748 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
6749 const DebugLoc &DL = MI.getDebugLoc();
6750 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6751 int Src0Idx =
6752 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
6753 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6754 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6755 .add(Src0);
6756 Src0.ChangeToRegister(Reg, false);
6757 return nullptr;
6758 }
6759
6760 // Legalize MUBUF instructions.
6761 bool isSoffsetLegal = true;
6762 int SoffsetIdx =
6763 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
6764 if (SoffsetIdx != -1) {
6765 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
6766 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
6767 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
6768 isSoffsetLegal = false;
6769 }
6770 }
6771
6772 bool isRsrcLegal = true;
6773 int RsrcIdx =
6774 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
6775 if (RsrcIdx != -1) {
6776 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6777 if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) {
6778 isRsrcLegal = false;
6779 }
6780 }
6781
6782 // The operands are legal.
6783 if (isRsrcLegal && isSoffsetLegal)
6784 return CreatedBB;
6785
6786 if (!isRsrcLegal) {
6787 // Legalize a VGPR Rsrc
6788 //
6789 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
6790 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
6791 // a zero-value SRsrc.
6792 //
6793 // If the instruction is _OFFSET (both idxen and offen disabled), and we
6794 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
6795 // above.
6796 //
6797 // Otherwise we are on non-ADDR64 hardware, and/or we have
6798 // idxen/offen/bothen and we fall back to a waterfall loop.
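// Illustrative example of the first case: for a BUFFER_LOAD_DWORD_ADDR64 with
// a VGPR rsrc, the 64-bit base is extracted from the descriptor, added into
// vaddr with V_ADD_CO_U32 / V_ADDC_U32, and the instruction continues with
// the zero-based SRsrc produced by extractRsrcPtr.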
6799
6800 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6801 MachineBasicBlock &MBB = *MI.getParent();
6802
6803 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
6804 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
6805 // This is already an ADDR64 instruction so we need to add the pointer
6806 // extracted from the resource descriptor to the current value of VAddr.
6807 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6808 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6809 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6810
6811 const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6812 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
6813 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
6814
6815 unsigned RsrcPtr, NewSRsrc;
6816 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6817
6818 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
6819 const DebugLoc &DL = MI.getDebugLoc();
6820 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
6821 .addDef(CondReg0)
6822 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6823 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
6824 .addImm(0);
6825
6826 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
6827 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
6828 .addDef(CondReg1, RegState::Dead)
6829 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6830 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
6831 .addReg(CondReg0, RegState::Kill)
6832 .addImm(0);
6833
6834 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6835 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
6836 .addReg(NewVAddrLo)
6837 .addImm(AMDGPU::sub0)
6838 .addReg(NewVAddrHi)
6839 .addImm(AMDGPU::sub1);
6840
6841 VAddr->setReg(NewVAddr);
6842 Rsrc->setReg(NewSRsrc);
6843 } else if (!VAddr && ST.hasAddr64()) {
6844 // This instruction is the _OFFSET variant, so we need to convert it to
6845 // ADDR64.
6847 "FIXME: Need to emit flat atomics here");
6848
6849 unsigned RsrcPtr, NewSRsrc;
6850 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6851
6852 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6853 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
6854 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
6855 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6856 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
6857
6858 // Atomics with return have an additional tied operand and are
6859 // missing some of the special bits.
6860 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
6861 MachineInstr *Addr64;
6862
6863 if (!VDataIn) {
6864 // Regular buffer load / store.
 6865 MachineInstrBuilder MIB =
6866 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6867 .add(*VData)
6868 .addReg(NewVAddr)
6869 .addReg(NewSRsrc)
6870 .add(*SOffset)
6871 .add(*Offset);
6872
6873 if (const MachineOperand *CPol =
6874 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
6875 MIB.addImm(CPol->getImm());
6876 }
6877
6878 if (const MachineOperand *TFE =
6879 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
6880 MIB.addImm(TFE->getImm());
6881 }
6882
6883 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
6884
6885 MIB.cloneMemRefs(MI);
6886 Addr64 = MIB;
6887 } else {
6888 // Atomics with return.
6889 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6890 .add(*VData)
6891 .add(*VDataIn)
6892 .addReg(NewVAddr)
6893 .addReg(NewSRsrc)
6894 .add(*SOffset)
6895 .add(*Offset)
6896 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
6897 .cloneMemRefs(MI);
6898 }
6899
6900 MI.removeFromParent();
6901
6902 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6903 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
6904 NewVAddr)
6905 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6906 .addImm(AMDGPU::sub0)
6907 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6908 .addImm(AMDGPU::sub1);
6909 } else {
6910 // Legalize a VGPR Rsrc and soffset together.
6911 if (!isSoffsetLegal) {
6912 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6913 CreatedBB =
6914 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
6915 return CreatedBB;
6916 }
6917 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
6918 return CreatedBB;
6919 }
6920 }
6921
6922 // Legalize a VGPR soffset.
6923 if (!isSoffsetLegal) {
6924 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6925 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
6926 return CreatedBB;
6927 }
6928 return CreatedBB;
6929}
6930
 6931void SIInstrWorklist::insert(MachineInstr *MI) {
6932 InstrList.insert(MI);
 6933 // Add MBUF instructions to deferred list.
6934 int RsrcIdx =
6935 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
6936 if (RsrcIdx != -1) {
6937 DeferredList.insert(MI);
6938 }
6939}
6940
 6941bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
6942 return DeferredList.contains(MI);
6943}
6944
 6945void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
6946 MachineDominatorTree *MDT) const {
6947
6948 while (!Worklist.empty()) {
6949 MachineInstr &Inst = *Worklist.top();
6950 Worklist.erase_top();
6951 // Skip MachineInstr in the deferred list.
6952 if (Worklist.isDeferred(&Inst))
6953 continue;
6954 moveToVALUImpl(Worklist, MDT, Inst);
6955 }
6956
 6957 // The deferred list of instructions is processed once all the
 6958 // MachineInstrs in the worklist are done.
6959 for (MachineInstr *Inst : Worklist.getDeferredList()) {
6960 moveToVALUImpl(Worklist, MDT, *Inst);
6961 assert(Worklist.empty() &&
6962 "Deferred MachineInstr are not supposed to re-populate worklist");
6963 }
6964}
6965
 6966void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
 6967 MachineDominatorTree *MDT,
6968 MachineInstr &Inst) const {
6969
 6970 MachineBasicBlock *MBB = Inst.getParent();
6971 if (!MBB)
6972 return;
 6973 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6974 unsigned Opcode = Inst.getOpcode();
6975 unsigned NewOpcode = getVALUOp(Inst);
6976 // Handle some special cases
6977 switch (Opcode) {
6978 default:
6979 break;
6980 case AMDGPU::S_ADD_U64_PSEUDO:
6981 NewOpcode = AMDGPU::V_ADD_U64_PSEUDO;
6982 break;
6983 case AMDGPU::S_SUB_U64_PSEUDO:
6984 NewOpcode = AMDGPU::V_SUB_U64_PSEUDO;
6985 break;
6986 case AMDGPU::S_ADD_I32:
6987 case AMDGPU::S_SUB_I32: {
6988 // FIXME: The u32 versions currently selected use the carry.
6989 bool Changed;
6990 MachineBasicBlock *CreatedBBTmp = nullptr;
6991 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
6992 if (Changed)
6993 return;
6994
6995 // Default handling
6996 break;
6997 }
6998
6999 case AMDGPU::S_MUL_U64:
 7000 // Split s_mul_u64 into 32-bit vector multiplications.
7001 splitScalarSMulU64(Worklist, Inst, MDT);
7002 Inst.eraseFromParent();
7003 return;
7004
7005 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7006 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7007 // This is a special case of s_mul_u64 where all the operands are either
7008 // zero extended or sign extended.
7009 splitScalarSMulPseudo(Worklist, Inst, MDT);
7010 Inst.eraseFromParent();
7011 return;
7012
7013 case AMDGPU::S_AND_B64:
7014 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7015 Inst.eraseFromParent();
7016 return;
7017
7018 case AMDGPU::S_OR_B64:
7019 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7020 Inst.eraseFromParent();
7021 return;
7022
7023 case AMDGPU::S_XOR_B64:
7024 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7025 Inst.eraseFromParent();
7026 return;
7027
7028 case AMDGPU::S_NAND_B64:
7029 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7030 Inst.eraseFromParent();
7031 return;
7032
7033 case AMDGPU::S_NOR_B64:
7034 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7035 Inst.eraseFromParent();
7036 return;
7037
7038 case AMDGPU::S_XNOR_B64:
7039 if (ST.hasDLInsts())
7040 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7041 else
7042 splitScalar64BitXnor(Worklist, Inst, MDT);
7043 Inst.eraseFromParent();
7044 return;
7045
7046 case AMDGPU::S_ANDN2_B64:
7047 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7048 Inst.eraseFromParent();
7049 return;
7050
7051 case AMDGPU::S_ORN2_B64:
7052 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7053 Inst.eraseFromParent();
7054 return;
7055
7056 case AMDGPU::S_BREV_B64:
7057 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7058 Inst.eraseFromParent();
7059 return;
7060
7061 case AMDGPU::S_NOT_B64:
7062 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7063 Inst.eraseFromParent();
7064 return;
7065
7066 case AMDGPU::S_BCNT1_I32_B64:
7067 splitScalar64BitBCNT(Worklist, Inst);
7068 Inst.eraseFromParent();
7069 return;
7070
7071 case AMDGPU::S_BFE_I64:
7072 splitScalar64BitBFE(Worklist, Inst);
7073 Inst.eraseFromParent();
7074 return;
7075
7076 case AMDGPU::S_FLBIT_I32_B64:
7077 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7078 Inst.eraseFromParent();
7079 return;
7080 case AMDGPU::S_FF1_I32_B64:
7081 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7082 Inst.eraseFromParent();
7083 return;
7084
7085 case AMDGPU::S_LSHL_B32:
7086 if (ST.hasOnlyRevVALUShifts()) {
7087 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7088 swapOperands(Inst);
7089 }
7090 break;
7091 case AMDGPU::S_ASHR_I32:
7092 if (ST.hasOnlyRevVALUShifts()) {
7093 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7094 swapOperands(Inst);
7095 }
7096 break;
7097 case AMDGPU::S_LSHR_B32:
7098 if (ST.hasOnlyRevVALUShifts()) {
7099 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7100 swapOperands(Inst);
7101 }
7102 break;
7103 case AMDGPU::S_LSHL_B64:
7104 if (ST.hasOnlyRevVALUShifts()) {
7105 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7106 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7107 : AMDGPU::V_LSHLREV_B64_e64;
7108 swapOperands(Inst);
7109 }
7110 break;
7111 case AMDGPU::S_ASHR_I64:
7112 if (ST.hasOnlyRevVALUShifts()) {
7113 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7114 swapOperands(Inst);
7115 }
7116 break;
7117 case AMDGPU::S_LSHR_B64:
7118 if (ST.hasOnlyRevVALUShifts()) {
7119 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7120 swapOperands(Inst);
7121 }
7122 break;
7123
7124 case AMDGPU::S_ABS_I32:
7125 lowerScalarAbs(Worklist, Inst);
7126 Inst.eraseFromParent();
7127 return;
7128
7129 case AMDGPU::S_CBRANCH_SCC0:
7130 case AMDGPU::S_CBRANCH_SCC1: {
7131 // Clear unused bits of vcc
7132 Register CondReg = Inst.getOperand(1).getReg();
7133 bool IsSCC = CondReg == AMDGPU::SCC;
7134 Register VCC = RI.getVCC();
7135 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7136 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7137 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7138 .addReg(EXEC)
7139 .addReg(IsSCC ? VCC : CondReg);
7140 Inst.removeOperand(1);
7141 } break;
7142
7143 case AMDGPU::S_BFE_U64:
7144 case AMDGPU::S_BFM_B64:
7145 llvm_unreachable("Moving this op to VALU not implemented");
7146
7147 case AMDGPU::S_PACK_LL_B32_B16:
7148 case AMDGPU::S_PACK_LH_B32_B16:
7149 case AMDGPU::S_PACK_HL_B32_B16:
7150 case AMDGPU::S_PACK_HH_B32_B16:
7151 movePackToVALU(Worklist, MRI, Inst);
7152 Inst.eraseFromParent();
7153 return;
7154
7155 case AMDGPU::S_XNOR_B32:
7156 lowerScalarXnor(Worklist, Inst);
7157 Inst.eraseFromParent();
7158 return;
7159
7160 case AMDGPU::S_NAND_B32:
7161 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7162 Inst.eraseFromParent();
7163 return;
7164
7165 case AMDGPU::S_NOR_B32:
7166 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7167 Inst.eraseFromParent();
7168 return;
7169
7170 case AMDGPU::S_ANDN2_B32:
7171 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7172 Inst.eraseFromParent();
7173 return;
7174
7175 case AMDGPU::S_ORN2_B32:
7176 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7177 Inst.eraseFromParent();
7178 return;
7179
7180 // TODO: remove as soon as everything is ready
7181 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7182 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7183 // can only be selected from the uniform SDNode.
7184 case AMDGPU::S_ADD_CO_PSEUDO:
7185 case AMDGPU::S_SUB_CO_PSEUDO: {
7186 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7187 ? AMDGPU::V_ADDC_U32_e64
7188 : AMDGPU::V_SUBB_U32_e64;
7189 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7190
7191 Register CarryInReg = Inst.getOperand(4).getReg();
7192 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7193 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7194 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7195 .addReg(CarryInReg);
7196 }
7197
7198 Register CarryOutReg = Inst.getOperand(1).getReg();
7199
7200 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7201 MRI.getRegClass(Inst.getOperand(0).getReg())));
7202 MachineInstr *CarryOp =
7203 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7204 .addReg(CarryOutReg, RegState::Define)
7205 .add(Inst.getOperand(2))
7206 .add(Inst.getOperand(3))
7207 .addReg(CarryInReg)
7208 .addImm(0);
7209 legalizeOperands(*CarryOp);
7210 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7211 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7212 Inst.eraseFromParent();
7213 }
7214 return;
7215 case AMDGPU::S_UADDO_PSEUDO:
7216 case AMDGPU::S_USUBO_PSEUDO: {
7217 const DebugLoc &DL = Inst.getDebugLoc();
7218 MachineOperand &Dest0 = Inst.getOperand(0);
7219 MachineOperand &Dest1 = Inst.getOperand(1);
7220 MachineOperand &Src0 = Inst.getOperand(2);
7221 MachineOperand &Src1 = Inst.getOperand(3);
7222
7223 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7224 ? AMDGPU::V_ADD_CO_U32_e64
7225 : AMDGPU::V_SUB_CO_U32_e64;
7226 const TargetRegisterClass *NewRC =
7227 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7228 Register DestReg = MRI.createVirtualRegister(NewRC);
7229 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7230 .addReg(Dest1.getReg(), RegState::Define)
7231 .add(Src0)
7232 .add(Src1)
7233 .addImm(0); // clamp bit
7234
7235 legalizeOperands(*NewInstr, MDT);
7236 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7237 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7238 Worklist);
7239 Inst.eraseFromParent();
7240 }
7241 return;
7242
7243 case AMDGPU::S_CSELECT_B32:
7244 case AMDGPU::S_CSELECT_B64:
7245 lowerSelect(Worklist, Inst, MDT);
7246 Inst.eraseFromParent();
7247 return;
7248 case AMDGPU::S_CMP_EQ_I32:
7249 case AMDGPU::S_CMP_LG_I32:
7250 case AMDGPU::S_CMP_GT_I32:
7251 case AMDGPU::S_CMP_GE_I32:
7252 case AMDGPU::S_CMP_LT_I32:
7253 case AMDGPU::S_CMP_LE_I32:
7254 case AMDGPU::S_CMP_EQ_U32:
7255 case AMDGPU::S_CMP_LG_U32:
7256 case AMDGPU::S_CMP_GT_U32:
7257 case AMDGPU::S_CMP_GE_U32:
7258 case AMDGPU::S_CMP_LT_U32:
7259 case AMDGPU::S_CMP_LE_U32:
7260 case AMDGPU::S_CMP_EQ_U64:
7261 case AMDGPU::S_CMP_LG_U64:
7262 case AMDGPU::S_CMP_LT_F32:
7263 case AMDGPU::S_CMP_EQ_F32:
7264 case AMDGPU::S_CMP_LE_F32:
7265 case AMDGPU::S_CMP_GT_F32:
7266 case AMDGPU::S_CMP_LG_F32:
7267 case AMDGPU::S_CMP_GE_F32:
7268 case AMDGPU::S_CMP_O_F32:
7269 case AMDGPU::S_CMP_U_F32:
7270 case AMDGPU::S_CMP_NGE_F32:
7271 case AMDGPU::S_CMP_NLG_F32:
7272 case AMDGPU::S_CMP_NGT_F32:
7273 case AMDGPU::S_CMP_NLE_F32:
7274 case AMDGPU::S_CMP_NEQ_F32:
7275 case AMDGPU::S_CMP_NLT_F32:
7276 case AMDGPU::S_CMP_LT_F16:
7277 case AMDGPU::S_CMP_EQ_F16:
7278 case AMDGPU::S_CMP_LE_F16:
7279 case AMDGPU::S_CMP_GT_F16:
7280 case AMDGPU::S_CMP_LG_F16:
7281 case AMDGPU::S_CMP_GE_F16:
7282 case AMDGPU::S_CMP_O_F16:
7283 case AMDGPU::S_CMP_U_F16:
7284 case AMDGPU::S_CMP_NGE_F16:
7285 case AMDGPU::S_CMP_NLG_F16:
7286 case AMDGPU::S_CMP_NGT_F16:
7287 case AMDGPU::S_CMP_NLE_F16:
7288 case AMDGPU::S_CMP_NEQ_F16:
7289 case AMDGPU::S_CMP_NLT_F16: {
7290 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7291 auto NewInstr =
7292 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7293 .setMIFlags(Inst.getFlags());
7294 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7295 AMDGPU::OpName::src0_modifiers) >= 0) {
7296 NewInstr
7297 .addImm(0) // src0_modifiers
7298 .add(Inst.getOperand(0)) // src0
7299 .addImm(0) // src1_modifiers
7300 .add(Inst.getOperand(1)) // src1
7301 .addImm(0); // clamp
7302 } else {
7303 NewInstr
7304 .add(Inst.getOperand(0))
7305 .add(Inst.getOperand(1));
7306 }
7307 legalizeOperands(*NewInstr, MDT);
7308 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7309 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7310 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7311 Inst.eraseFromParent();
7312 return;
7313 }
7314 case AMDGPU::S_CVT_HI_F32_F16: {
7315 const DebugLoc &DL = Inst.getDebugLoc();
7316 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7317 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7318 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7319 .addImm(16)
7320 .add(Inst.getOperand(1));
7321 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7322 .addImm(0) // src0_modifiers
7323 .addReg(TmpReg)
7324 .addImm(0) // clamp
7325 .addImm(0); // omod
7326
7327 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7328 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7329 Inst.eraseFromParent();
7330 return;
7331 }
7332 case AMDGPU::S_MINIMUM_F32:
7333 case AMDGPU::S_MAXIMUM_F32:
7334 case AMDGPU::S_MINIMUM_F16:
7335 case AMDGPU::S_MAXIMUM_F16: {
7336 const DebugLoc &DL = Inst.getDebugLoc();
7337 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7338 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7339 .addImm(0) // src0_modifiers
7340 .add(Inst.getOperand(1))
7341 .addImm(0) // src1_modifiers
7342 .add(Inst.getOperand(2))
7343 .addImm(0) // clamp
7344 .addImm(0); // omod
7345 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7346
7347 legalizeOperands(*NewInstr, MDT);
7348 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7349 Inst.eraseFromParent();
7350 return;
7351 }
7352 }
7353
7354 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
7355 // We cannot move this instruction to the VALU, so we should try to
7356 // legalize its operands instead.
7357 legalizeOperands(Inst, MDT);
7358 return;
7359 }
7360 // Handle converting generic instructions like COPY-to-SGPR into
7361 // COPY-to-VGPR.
7362 if (NewOpcode == Opcode) {
7363 Register DstReg = Inst.getOperand(0).getReg();
7364 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7365
7366 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7367 // hope for the best.
7368 if (Inst.isCopy() && DstReg.isPhysical() &&
7369 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7370 // TODO: Only works for 32 bit registers.
7371 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7372 get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7373 .add(Inst.getOperand(1));
7374 Inst.eraseFromParent();
7375 return;
7376 }
7377
7378 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
7379 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
7380 // Instead of creating a copy where src and dst are the same register
7381 // class, we just replace all uses of dst with src. These kinds of
7382 // copies interfere with the heuristics MachineSink uses to decide
 7383 // whether or not to split a critical edge, since the pass assumes
7384 // that copies will end up as machine instructions and not be
7385 // eliminated.
7386 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
7387 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
7388 MRI.clearKillFlags(Inst.getOperand(1).getReg());
7389 Inst.getOperand(0).setReg(DstReg);
7390 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7391 // these are deleted later, but at -O0 it would leave a suspicious
7392 // looking illegal copy of an undef register.
7393 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7394 Inst.removeOperand(I);
7395 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
7396 return;
7397 }
7398 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7399 MRI.replaceRegWith(DstReg, NewDstReg);
7400 legalizeOperands(Inst, MDT);
7401 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7402 return;
7403 }
7404
7405 // Use the new VALU Opcode.
7406 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
7407 .setMIFlags(Inst.getFlags());
7408 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
7409 // Intersperse VOP3 modifiers among the SALU operands.
7410 NewInstr->addOperand(Inst.getOperand(0));
7411 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7412 AMDGPU::OpName::src0_modifiers) >= 0)
7413 NewInstr.addImm(0);
7414 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7415 MachineOperand Src = Inst.getOperand(1);
7416 if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7417 Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7418 NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7419 else
7420 NewInstr->addOperand(Src);
7421 }
7422
7423 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
7424 // We are converting these to a BFE, so we need to add the missing
7425 // operands for the size and offset.
7426 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
7427 NewInstr.addImm(0);
7428 NewInstr.addImm(Size);
7429 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
7430 // The VALU version adds the second operand to the result, so insert an
7431 // extra 0 operand.
7432 NewInstr.addImm(0);
7433 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
7434 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
7435 // If we need to move this to VGPRs, we need to unpack the second
7436 // operand back into the 2 separate ones for bit offset and width.
7437 assert(OffsetWidthOp.isImm() &&
7438 "Scalar BFE is only implemented for constant width and offset");
7439 uint32_t Imm = OffsetWidthOp.getImm();
7440
7441 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
7442 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
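 // For example, a packed operand of 0x00080004 encodes offset = 4 and
 // width = 8, i.e. the VALU BFE extracts bits [11:4].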
7443 NewInstr.addImm(Offset);
7444 NewInstr.addImm(BitWidth);
7445 } else {
7446 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7447 AMDGPU::OpName::src1_modifiers) >= 0)
7448 NewInstr.addImm(0);
7449 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
7450 NewInstr->addOperand(Inst.getOperand(2));
7451 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7452 AMDGPU::OpName::src2_modifiers) >= 0)
7453 NewInstr.addImm(0);
7454 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
7455 NewInstr->addOperand(Inst.getOperand(3));
7456 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
7457 NewInstr.addImm(0);
7458 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
7459 NewInstr.addImm(0);
7460 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
7461 NewInstr.addImm(0);
7462 }
7463 } else {
7464 // Just copy the SALU operands.
7465 for (const MachineOperand &Op : Inst.explicit_operands())
7466 NewInstr->addOperand(Op);
7467 }
7468
7469 // Remove any references to SCC. Vector instructions can't read from it, and
 7470 // we're just about to add the implicit use / defs of VCC, so we don't want
7471 // both.
7472 for (MachineOperand &Op : Inst.implicit_operands()) {
7473 if (Op.getReg() == AMDGPU::SCC) {
7474 // Only propagate through live-def of SCC.
7475 if (Op.isDef() && !Op.isDead())
7476 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
7477 if (Op.isUse())
7478 addSCCDefsToVALUWorklist(NewInstr, Worklist);
7479 }
7480 }
7481 Inst.eraseFromParent();
7482 Register NewDstReg;
7483 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
7484 Register DstReg = NewInstr->getOperand(0).getReg();
7485 assert(DstReg.isVirtual());
7486 // Update the destination register class.
7487 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
7488 assert(NewDstRC);
7489 NewDstReg = MRI.createVirtualRegister(NewDstRC);
7490 MRI.replaceRegWith(DstReg, NewDstReg);
7491 }
7492 fixImplicitOperands(*NewInstr);
7493 // Legalize the operands
7494 legalizeOperands(*NewInstr, MDT);
7495 if (NewDstReg)
7496 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7497}
7498
7499// Add/sub require special handling to deal with carry outs.
7500std::pair<bool, MachineBasicBlock *>
7501SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
7502 MachineDominatorTree *MDT) const {
7503 if (ST.hasAddNoCarry()) {
7504 // Assume there is no user of scc since we don't select this in that case.
7505 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
7506 // is used.
7507
7508 MachineBasicBlock &MBB = *Inst.getParent();
 7509 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7510
7511 Register OldDstReg = Inst.getOperand(0).getReg();
7512 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7513
7514 unsigned Opc = Inst.getOpcode();
7515 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
7516
7517 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
7518 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
7519
7520 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
7521 Inst.removeOperand(3);
7522
7523 Inst.setDesc(get(NewOpc));
7524 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
7526 MRI.replaceRegWith(OldDstReg, ResultReg);
7527 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
7528
7529 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7530 return std::pair(true, NewBB);
7531 }
7532
7533 return std::pair(false, nullptr);
7534}
7535
7536void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
7537 MachineDominatorTree *MDT) const {
7538
7539 MachineBasicBlock &MBB = *Inst.getParent();
 7540 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7541 MachineBasicBlock::iterator MII = Inst;
7542 DebugLoc DL = Inst.getDebugLoc();
7543
7544 MachineOperand &Dest = Inst.getOperand(0);
7545 MachineOperand &Src0 = Inst.getOperand(1);
7546 MachineOperand &Src1 = Inst.getOperand(2);
7547 MachineOperand &Cond = Inst.getOperand(3);
7548
7549 Register CondReg = Cond.getReg();
7550 bool IsSCC = (CondReg == AMDGPU::SCC);
7551
7552 // If this is a trivial select where the condition is effectively not SCC
7553 // (CondReg is a source of copy to SCC), then the select is semantically
7554 // equivalent to copying CondReg. Hence, there is no need to create
 7555 // a V_CNDMASK; we can just use CondReg and bail out.
7556 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
7557 (Src1.getImm() == 0)) {
7558 MRI.replaceRegWith(Dest.getReg(), CondReg);
7559 return;
7560 }
7561
7562 Register NewCondReg = CondReg;
7563 if (IsSCC) {
7564 const TargetRegisterClass *TC =
7565 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7566 NewCondReg = MRI.createVirtualRegister(TC);
7567
7568 // Now look for the closest SCC def if it is a copy
7569 // replacing the CondReg with the COPY source register
7570 bool CopyFound = false;
7571 for (MachineInstr &CandI :
 7572 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
7573 Inst.getParent()->rend())) {
7574 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
7575 -1) {
7576 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
7577 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
7578 .addReg(CandI.getOperand(1).getReg());
7579 CopyFound = true;
7580 }
7581 break;
7582 }
7583 }
7584 if (!CopyFound) {
7585 // SCC def is not a copy
7586 // Insert a trivial select instead of creating a copy, because a copy from
7587 // SCC would semantically mean just copying a single bit, but we may need
7588 // the result to be a vector condition mask that needs preserving.
7589 unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
7590 : AMDGPU::S_CSELECT_B32;
7591 auto NewSelect =
7592 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
7593 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
7594 }
7595 }
7596
7597 Register NewDestReg = MRI.createVirtualRegister(
7598 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
7599 MachineInstr *NewInst;
7600 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
7601 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
7602 .addImm(0)
7603 .add(Src1) // False
7604 .addImm(0)
7605 .add(Src0) // True
7606 .addReg(NewCondReg);
7607 } else {
7608 NewInst =
7609 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
7610 .add(Src1) // False
7611 .add(Src0) // True
7612 .addReg(NewCondReg);
7613 }
7614 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
7615 legalizeOperands(*NewInst, MDT);
7616 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
7617}
7618
7619void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
7620 MachineInstr &Inst) const {
7621 MachineBasicBlock &MBB = *Inst.getParent();
 7622 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7623 MachineBasicBlock::iterator MII = Inst;
7624 DebugLoc DL = Inst.getDebugLoc();
7625
7626 MachineOperand &Dest = Inst.getOperand(0);
7627 MachineOperand &Src = Inst.getOperand(1);
7628 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7629 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7630
7631 unsigned SubOp = ST.hasAddNoCarry() ?
7632 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
7633
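 // abs(x) is computed as max(x, 0 - x): negate with a subtract from zero,
 // then take the signed maximum.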
7634 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
7635 .addImm(0)
7636 .addReg(Src.getReg());
7637
7638 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
7639 .addReg(Src.getReg())
7640 .addReg(TmpReg);
7641
7642 MRI.replaceRegWith(Dest.getReg(), ResultReg);
7643 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7644}
7645
7646void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
7647 MachineInstr &Inst) const {
7648 MachineBasicBlock &MBB = *Inst.getParent();
 7649 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7650 MachineBasicBlock::iterator MII = Inst;
7651 const DebugLoc &DL = Inst.getDebugLoc();
7652
7653 MachineOperand &Dest = Inst.getOperand(0);
7654 MachineOperand &Src0 = Inst.getOperand(1);
7655 MachineOperand &Src1 = Inst.getOperand(2);
7656
7657 if (ST.hasDLInsts()) {
7658 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7659 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
7660 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
7661
7662 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
7663 .add(Src0)
7664 .add(Src1);
7665
7666 MRI.replaceRegWith(Dest.getReg(), NewDest);
7667 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7668 } else {
7669 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
7670 // invert either source and then perform the XOR. If either source is a
7671 // scalar register, then we can leave the inversion on the scalar unit to
7672 // achieve a better distribution of scalar and vector instructions.
7673 bool Src0IsSGPR = Src0.isReg() &&
7674 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
7675 bool Src1IsSGPR = Src1.isReg() &&
7676 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
 7677 MachineInstr *Xor;
7678 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7679 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7680
7681 // Build a pair of scalar instructions and add them to the work list.
7682 // The next iteration over the work list will lower these to the vector
7683 // unit as necessary.
7684 if (Src0IsSGPR) {
7685 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
7686 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7687 .addReg(Temp)
7688 .add(Src1);
7689 } else if (Src1IsSGPR) {
7690 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
7691 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7692 .add(Src0)
7693 .addReg(Temp);
7694 } else {
7695 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
7696 .add(Src0)
7697 .add(Src1);
7698 MachineInstr *Not =
7699 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
7700 Worklist.insert(Not);
7701 }
7702
7703 MRI.replaceRegWith(Dest.getReg(), NewDest);
7704
7705 Worklist.insert(Xor);
7706
7707 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7708 }
7709}
7710
7711void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
7712 MachineInstr &Inst,
7713 unsigned Opcode) const {
7714 MachineBasicBlock &MBB = *Inst.getParent();
 7715 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7716 MachineBasicBlock::iterator MII = Inst;
7717 const DebugLoc &DL = Inst.getDebugLoc();
7718
7719 MachineOperand &Dest = Inst.getOperand(0);
7720 MachineOperand &Src0 = Inst.getOperand(1);
7721 MachineOperand &Src1 = Inst.getOperand(2);
7722
7723 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7724 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7725
7726 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
7727 .add(Src0)
7728 .add(Src1);
7729
7730 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
7731 .addReg(Interm);
7732
7733 Worklist.insert(&Op);
7734 Worklist.insert(&Not);
7735
7736 MRI.replaceRegWith(Dest.getReg(), NewDest);
7737 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7738}
7739
7740void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
7741 MachineInstr &Inst,
7742 unsigned Opcode) const {
7743 MachineBasicBlock &MBB = *Inst.getParent();
 7744 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7745 MachineBasicBlock::iterator MII = Inst;
7746 const DebugLoc &DL = Inst.getDebugLoc();
7747
7748 MachineOperand &Dest = Inst.getOperand(0);
7749 MachineOperand &Src0 = Inst.getOperand(1);
7750 MachineOperand &Src1 = Inst.getOperand(2);
7751
7752 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7753 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7754
7755 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
7756 .add(Src1);
7757
7758 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
7759 .add(Src0)
7760 .addReg(Interm);
7761
7762 Worklist.insert(&Not);
7763 Worklist.insert(&Op);
7764
7765 MRI.replaceRegWith(Dest.getReg(), NewDest);
7766 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7767}
7768
7769void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
7770 MachineInstr &Inst, unsigned Opcode,
7771 bool Swap) const {
7772 MachineBasicBlock &MBB = *Inst.getParent();
 7773 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7774
7775 MachineOperand &Dest = Inst.getOperand(0);
7776 MachineOperand &Src0 = Inst.getOperand(1);
7777 DebugLoc DL = Inst.getDebugLoc();
7778
7779 MachineBasicBlock::iterator MII = Inst;
7780
7781 const MCInstrDesc &InstDesc = get(Opcode);
7782 const TargetRegisterClass *Src0RC = Src0.isReg() ?
7783 MRI.getRegClass(Src0.getReg()) :
7784 &AMDGPU::SGPR_32RegClass;
7785
7786 const TargetRegisterClass *Src0SubRC =
7787 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7788
7789 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7790 AMDGPU::sub0, Src0SubRC);
7791
7792 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
7793 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
7794 const TargetRegisterClass *NewDestSubRC =
7795 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
7796
7797 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
7798 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
7799
7800 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7801 AMDGPU::sub1, Src0SubRC);
7802
7803 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
7804 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
7805
7806 if (Swap)
7807 std::swap(DestSub0, DestSub1);
7808
7809 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
7810 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7811 .addReg(DestSub0)
7812 .addImm(AMDGPU::sub0)
7813 .addReg(DestSub1)
7814 .addImm(AMDGPU::sub1);
7815
7816 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7817
7818 Worklist.insert(&LoHalf);
7819 Worklist.insert(&HiHalf);
7820
7821 // We don't need to legalizeOperands here because for a single operand, src0
7822 // will support any kind of input.
7823
7824 // Move all users of this moved value.
7825 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7826}
7827
 7828// There is no vector equivalent of s_mul_u64. For this reason, we need to
 7829// split s_mul_u64 into 32-bit vector multiplications.
7830void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
7831 MachineInstr &Inst,
7832 MachineDominatorTree *MDT) const {
7833 MachineBasicBlock &MBB = *Inst.getParent();
 7834 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7835
7836 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7837 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7838 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7839
7840 MachineOperand &Dest = Inst.getOperand(0);
7841 MachineOperand &Src0 = Inst.getOperand(1);
7842 MachineOperand &Src1 = Inst.getOperand(2);
7843 const DebugLoc &DL = Inst.getDebugLoc();
7844 MachineBasicBlock::iterator MII = Inst;
7845
7846 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7847 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7848 const TargetRegisterClass *Src0SubRC =
7849 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7850 if (RI.isSGPRClass(Src0SubRC))
7851 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7852 const TargetRegisterClass *Src1SubRC =
7853 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7854 if (RI.isSGPRClass(Src1SubRC))
7855 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7856
7857 // First, we extract the low 32-bit and high 32-bit values from each of the
7858 // operands.
7859 MachineOperand Op0L =
7860 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7861 MachineOperand Op1L =
7862 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7863 MachineOperand Op0H =
7864 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
7865 MachineOperand Op1H =
7866 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
7867
 7868 // The multiplication is done as follows:
7869 //
7870 // Op1H Op1L
7871 // * Op0H Op0L
7872 // --------------------
7873 // Op1H*Op0L Op1L*Op0L
7874 // + Op1H*Op0H Op1L*Op0H
7875 // -----------------------------------------
7876 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
7877 //
7878 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
7879 // value and that would overflow.
7880 // The low 32-bit value is Op1L*Op0L.
7881 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
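 // For example, with Op0 = 0x0000'0002'0000'0003 and Op1 = 0x0000'0004'0000'0005:
 // Op1L*Op0L = 15 (no carry into bit 32), Op1H*Op0L + Op1L*Op0H = 4*3 + 5*2 = 22,
 // so the truncated 64-bit result is 0x0000'0016'0000'000F.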
7882
7883 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7884 MachineInstr *Op1L_Op0H =
7885 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
7886 .add(Op1L)
7887 .add(Op0H);
7888
7889 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7890 MachineInstr *Op1H_Op0L =
7891 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
7892 .add(Op1H)
7893 .add(Op0L);
7894
7895 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7896 MachineInstr *Carry =
7897 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
7898 .add(Op1L)
7899 .add(Op0L);
7900
7901 MachineInstr *LoHalf =
7902 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7903 .add(Op1L)
7904 .add(Op0L);
7905
7906 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7907 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
7908 .addReg(Op1L_Op0H_Reg)
7909 .addReg(Op1H_Op0L_Reg);
7910
7911 MachineInstr *HiHalf =
7912 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
7913 .addReg(AddReg)
7914 .addReg(CarryReg);
7915
7916 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7917 .addReg(DestSub0)
7918 .addImm(AMDGPU::sub0)
7919 .addReg(DestSub1)
7920 .addImm(AMDGPU::sub1);
7921
7922 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7923
7924 // Try to legalize the operands in case we need to swap the order to keep it
7925 // valid.
7926 legalizeOperands(*Op1L_Op0H, MDT);
7927 legalizeOperands(*Op1H_Op0L, MDT);
7928 legalizeOperands(*Carry, MDT);
7929 legalizeOperands(*LoHalf, MDT);
7930 legalizeOperands(*Add, MDT);
7931 legalizeOperands(*HiHalf, MDT);
7932
7933 // Move all users of this moved value.
7934 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7935}
7936
 7937// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
7938// multiplications.
7939void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
7940 MachineInstr &Inst,
7941 MachineDominatorTree *MDT) const {
7942 MachineBasicBlock &MBB = *Inst.getParent();
 7943 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7944
7945 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7946 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7947 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7948
7949 MachineOperand &Dest = Inst.getOperand(0);
7950 MachineOperand &Src0 = Inst.getOperand(1);
7951 MachineOperand &Src1 = Inst.getOperand(2);
7952 const DebugLoc &DL = Inst.getDebugLoc();
7953 MachineBasicBlock::iterator MII = Inst;
7954
7955 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7956 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7957 const TargetRegisterClass *Src0SubRC =
7958 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7959 if (RI.isSGPRClass(Src0SubRC))
7960 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7961 const TargetRegisterClass *Src1SubRC =
7962 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7963 if (RI.isSGPRClass(Src1SubRC))
7964 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7965
7966 // First, we extract the low 32-bit and high 32-bit values from each of the
7967 // operands.
7968 MachineOperand Op0L =
7969 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7970 MachineOperand Op1L =
7971 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7972
7973 unsigned Opc = Inst.getOpcode();
7974 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
7975 ? AMDGPU::V_MUL_HI_U32_e64
7976 : AMDGPU::V_MUL_HI_I32_e64;
7977 MachineInstr *HiHalf =
7978 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
7979
7980 MachineInstr *LoHalf =
7981 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7982 .add(Op1L)
7983 .add(Op0L);
7984
7985 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7986 .addReg(DestSub0)
7987 .addImm(AMDGPU::sub0)
7988 .addReg(DestSub1)
7989 .addImm(AMDGPU::sub1);
7990
7991 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7992
7993 // Try to legalize the operands in case we need to swap the order to keep it
7994 // valid.
7995 legalizeOperands(*HiHalf, MDT);
7996 legalizeOperands(*LoHalf, MDT);
7997
7998 // Move all users of this moved value.
7999 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8000}
8001
8002void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8003 MachineInstr &Inst, unsigned Opcode,
8004 MachineDominatorTree *MDT) const {
8005 MachineBasicBlock &MBB = *Inst.getParent();
 8006 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8007
8008 MachineOperand &Dest = Inst.getOperand(0);
8009 MachineOperand &Src0 = Inst.getOperand(1);
8010 MachineOperand &Src1 = Inst.getOperand(2);
8011 DebugLoc DL = Inst.getDebugLoc();
8012
8013 MachineBasicBlock::iterator MII = Inst;
8014
8015 const MCInstrDesc &InstDesc = get(Opcode);
8016 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8017 MRI.getRegClass(Src0.getReg()) :
8018 &AMDGPU::SGPR_32RegClass;
8019
8020 const TargetRegisterClass *Src0SubRC =
8021 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8022 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8023 MRI.getRegClass(Src1.getReg()) :
8024 &AMDGPU::SGPR_32RegClass;
8025
8026 const TargetRegisterClass *Src1SubRC =
8027 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8028
8029 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8030 AMDGPU::sub0, Src0SubRC);
8031 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8032 AMDGPU::sub0, Src1SubRC);
8033 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8034 AMDGPU::sub1, Src0SubRC);
8035 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8036 AMDGPU::sub1, Src1SubRC);
8037
8038 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8039 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8040 const TargetRegisterClass *NewDestSubRC =
8041 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8042
8043 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8044 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8045 .add(SrcReg0Sub0)
8046 .add(SrcReg1Sub0);
8047
8048 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8049 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8050 .add(SrcReg0Sub1)
8051 .add(SrcReg1Sub1);
8052
8053 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8054 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8055 .addReg(DestSub0)
8056 .addImm(AMDGPU::sub0)
8057 .addReg(DestSub1)
8058 .addImm(AMDGPU::sub1);
8059
8060 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8061
8062 Worklist.insert(&LoHalf);
8063 Worklist.insert(&HiHalf);
8064
8065 // Move all users of this moved value.
8066 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8067}
8068
8069void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8070 MachineInstr &Inst,
8071 MachineDominatorTree *MDT) const {
8072 MachineBasicBlock &MBB = *Inst.getParent();
 8073 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8074
8075 MachineOperand &Dest = Inst.getOperand(0);
8076 MachineOperand &Src0 = Inst.getOperand(1);
8077 MachineOperand &Src1 = Inst.getOperand(2);
8078 const DebugLoc &DL = Inst.getDebugLoc();
8079
8080 MachineBasicBlock::iterator MII = Inst;
8081
8082 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8083
8084 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8085
8086 MachineOperand* Op0;
8087 MachineOperand* Op1;
8088
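 // Prefer to invert the source that is already in an SGPR so the S_NOT_B64
 // can stay on the scalar unit (same reasoning as in lowerScalarXnor).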
8089 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8090 Op0 = &Src0;
8091 Op1 = &Src1;
8092 } else {
8093 Op0 = &Src1;
8094 Op1 = &Src0;
8095 }
8096
8097 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8098 .add(*Op0);
8099
8100 Register NewDest = MRI.createVirtualRegister(DestRC);
8101
8102 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8103 .addReg(Interm)
8104 .add(*Op1);
8105
8106 MRI.replaceRegWith(Dest.getReg(), NewDest);
8107
8108 Worklist.insert(&Xor);
8109}
8110
8111void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8112 MachineInstr &Inst) const {
8113 MachineBasicBlock &MBB = *Inst.getParent();
 8114 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8115
8116 MachineBasicBlock::iterator MII = Inst;
8117 const DebugLoc &DL = Inst.getDebugLoc();
8118
8119 MachineOperand &Dest = Inst.getOperand(0);
8120 MachineOperand &Src = Inst.getOperand(1);
8121
8122 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8123 const TargetRegisterClass *SrcRC = Src.isReg() ?
8124 MRI.getRegClass(Src.getReg()) :
8125 &AMDGPU::SGPR_32RegClass;
8126
8127 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8128 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8129
8130 const TargetRegisterClass *SrcSubRC =
8131 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8132
8133 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8134 AMDGPU::sub0, SrcSubRC);
8135 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8136 AMDGPU::sub1, SrcSubRC);
8137
8138 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8139
8140 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8141
8142 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8143
8144 // We don't need to legalize operands here. src0 for either instruction can be
8145 // an SGPR, and the second input is unused or determined here.
8146 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8147}
8148
8149void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8150 MachineInstr &Inst) const {
8151 MachineBasicBlock &MBB = *Inst.getParent();
 8152 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8153 MachineBasicBlock::iterator MII = Inst;
8154 const DebugLoc &DL = Inst.getDebugLoc();
8155
8156 MachineOperand &Dest = Inst.getOperand(0);
8157 uint32_t Imm = Inst.getOperand(2).getImm();
8158 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8159 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8160
8161 (void) Offset;
8162
8163 // Only sext_inreg cases handled.
8164 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8165 Offset == 0 && "Not implemented");
8166
8167 if (BitWidth < 32) {
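 // For widths narrower than 32 bits, sign-extend the field within the low
 // half using V_BFE_I32, then replicate the sign bit into the high half with
 // an arithmetic shift right by 31.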
8168 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8169 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8170 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8171
8172 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8173 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8174 .addImm(0)
8175 .addImm(BitWidth);
8176
8177 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8178 .addImm(31)
8179 .addReg(MidRegLo);
8180
8181 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8182 .addReg(MidRegLo)
8183 .addImm(AMDGPU::sub0)
8184 .addReg(MidRegHi)
8185 .addImm(AMDGPU::sub1);
8186
8187 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8188 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8189 return;
8190 }
8191
8192 MachineOperand &Src = Inst.getOperand(1);
8193 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8194 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8195
8196 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8197 .addImm(31)
8198 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8199
8200 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8201 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8202 .addImm(AMDGPU::sub0)
8203 .addReg(TmpReg)
8204 .addImm(AMDGPU::sub1);
8205
8206 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8207 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8208}
8209
8210void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8211 MachineInstr &Inst, unsigned Opcode,
8212 MachineDominatorTree *MDT) const {
 8213 // (S_FLBIT_I32_B64 hi:lo) ->
 8214 //   (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
 8215 // (S_FF1_I32_B64 hi:lo) ->
 8216 //   (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
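 // For example, ctlz of 0x0000'0000'0000'0001: V_FFBH_U32(hi = 0) = 0xffffffff,
 // uaddsat(V_FFBH_U32(lo = 1) = 31, 32) = 63, and the umin yields 63.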
8217
8218 MachineBasicBlock &MBB = *Inst.getParent();
 8219 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8220 MachineBasicBlock::iterator MII = Inst;
8221 const DebugLoc &DL = Inst.getDebugLoc();
8222
8223 MachineOperand &Dest = Inst.getOperand(0);
8224 MachineOperand &Src = Inst.getOperand(1);
8225
8226 const MCInstrDesc &InstDesc = get(Opcode);
8227
8228 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8229 unsigned OpcodeAdd =
8230 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8231
8232 const TargetRegisterClass *SrcRC =
8233 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8234 const TargetRegisterClass *SrcSubRC =
8235 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8236
8237 MachineOperand SrcRegSub0 =
8238 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8239 MachineOperand SrcRegSub1 =
8240 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8241
8242 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8243 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8244 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8245 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8246
8247 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8248
8249 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8250
8251 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8252 .addReg(IsCtlz ? MidReg1 : MidReg2)
8253 .addImm(32)
8254 .addImm(1); // enable clamp
8255
8256 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8257 .addReg(MidReg3)
8258 .addReg(IsCtlz ? MidReg2 : MidReg1);
8259
8260 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8261
8262 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8263}
8264
8265void SIInstrInfo::addUsersToMoveToVALUWorklist(
 8266 Register DstReg, MachineRegisterInfo &MRI,
8267 SIInstrWorklist &Worklist) const {
8268 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
8269 E = MRI.use_end(); I != E;) {
8270 MachineInstr &UseMI = *I->getParent();
8271
8272 unsigned OpNo = 0;
8273
8274 switch (UseMI.getOpcode()) {
8275 case AMDGPU::COPY:
8276 case AMDGPU::WQM:
8277 case AMDGPU::SOFT_WQM:
8278 case AMDGPU::STRICT_WWM:
8279 case AMDGPU::STRICT_WQM:
8280 case AMDGPU::REG_SEQUENCE:
8281 case AMDGPU::PHI:
8282 case AMDGPU::INSERT_SUBREG:
8283 break;
8284 default:
8285 OpNo = I.getOperandNo();
8286 break;
8287 }
8288
8289 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
8290 Worklist.insert(&UseMI);
8291
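 // Skip the remaining uses of DstReg inside this instruction so UseMI is
 // only queued once.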
8292 do {
8293 ++I;
8294 } while (I != E && I->getParent() == &UseMI);
8295 } else {
8296 ++I;
8297 }
8298 }
8299}
8300
8301void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
 8302 MachineRegisterInfo &MRI,
8303 MachineInstr &Inst) const {
8304 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
 8305 MachineBasicBlock *MBB = Inst.getParent();
8306 MachineOperand &Src0 = Inst.getOperand(1);
8307 MachineOperand &Src1 = Inst.getOperand(2);
8308 const DebugLoc &DL = Inst.getDebugLoc();
8309
8310 switch (Inst.getOpcode()) {
8311 case AMDGPU::S_PACK_LL_B32_B16: {
8312 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8313 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8314
8315 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8316 // 0.
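 // Computes (Src1 << 16) | (Src0 & 0xffff); e.g. src0 = 0xAAAA1111 and
 // src1 = 0xBBBB2222 pack to 0x22221111.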
8317 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8318 .addImm(0xffff);
8319
8320 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
8321 .addReg(ImmReg, RegState::Kill)
8322 .add(Src0);
8323
8324 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8325 .add(Src1)
8326 .addImm(16)
8327 .addReg(TmpReg, RegState::Kill);
8328 break;
8329 }
8330 case AMDGPU::S_PACK_LH_B32_B16: {
8331 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8332 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8333 .addImm(0xffff);
8334 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
8335 .addReg(ImmReg, RegState::Kill)
8336 .add(Src0)
8337 .add(Src1);
8338 break;
8339 }
8340 case AMDGPU::S_PACK_HL_B32_B16: {
8341 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8342 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8343 .addImm(16)
8344 .add(Src0);
8345 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8346 .add(Src1)
8347 .addImm(16)
8348 .addReg(TmpReg, RegState::Kill);
8349 break;
8350 }
8351 case AMDGPU::S_PACK_HH_B32_B16: {
8352 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8353 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8354 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8355 .addImm(16)
8356 .add(Src0);
8357 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8358 .addImm(0xffff0000);
8359 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
8360 .add(Src1)
8361 .addReg(ImmReg, RegState::Kill)
8362 .addReg(TmpReg, RegState::Kill);
8363 break;
8364 }
8365 default:
8366 llvm_unreachable("unhandled s_pack_* instruction");
8367 }
8368
8369 MachineOperand &Dest = Inst.getOperand(0);
8370 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8371 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8372}
8373
8374void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
8375 MachineInstr &SCCDefInst,
8376 SIInstrWorklist &Worklist,
8377 Register NewCond) const {
8378
8379 // Ensure that def inst defines SCC, which is still live.
8380 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
8381 !Op.isDead() && Op.getParent() == &SCCDefInst);
8382 SmallVector<MachineInstr *, 4> CopyToDelete;
8383 // This assumes that all the users of SCC are in the same block
8384 // as the SCC def.
8385 for (MachineInstr &MI : // Skip the def inst itself.
8386 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
8387 SCCDefInst.getParent()->end())) {
8388 // Check if SCC is used first.
8389 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
8390 if (SCCIdx != -1) {
8391 if (MI.isCopy()) {
8392 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8393 Register DestReg = MI.getOperand(0).getReg();
8394
8395 MRI.replaceRegWith(DestReg, NewCond);
8396 CopyToDelete.push_back(&MI);
8397 } else {
8398
8399 if (NewCond.isValid())
8400 MI.getOperand(SCCIdx).setReg(NewCond);
8401
8402 Worklist.insert(&MI);
8403 }
8404 }
8405 // Exit if we find another SCC def.
8406 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
8407 break;
8408 }
8409 for (auto &Copy : CopyToDelete)
8410 Copy->eraseFromParent();
8411}
8412
8413// Instructions that use SCC may be converted to VALU instructions. When that
8414// happens, the SCC register is changed to VCC_LO. The instruction that defines
8415// SCC must be changed to an instruction that defines VCC. This function makes
8416// sure that the instruction that defines SCC is added to the moveToVALU
8417// worklist.
8418void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
8419 SIInstrWorklist &Worklist) const {
8420 // Look for a preceding instruction that either defines VCC or SCC. If VCC
8421 // then there is nothing to do because the defining instruction has been
8422 // converted to a VALU already. If SCC then that instruction needs to be
8423 // converted to a VALU.
8424 for (MachineInstr &MI :
8425 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
8426 SCCUseInst->getParent()->rend())) {
8427 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
8428 break;
8429 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
8430 Worklist.insert(&MI);
8431 break;
8432 }
8433 }
8434}
8435
8436const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
8437 const MachineInstr &Inst) const {
8438 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
8439
8440 switch (Inst.getOpcode()) {
8441 // For target instructions, getOpRegClass just returns the virtual register
8442 // class associated with the operand, so we need to find an equivalent VGPR
8443 // register class in order to move the instruction to the VALU.
8444 case AMDGPU::COPY:
8445 case AMDGPU::PHI:
8446 case AMDGPU::REG_SEQUENCE:
8447 case AMDGPU::INSERT_SUBREG:
8448 case AMDGPU::WQM:
8449 case AMDGPU::SOFT_WQM:
8450 case AMDGPU::STRICT_WWM:
8451 case AMDGPU::STRICT_WQM: {
8452 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
8453 if (RI.isAGPRClass(SrcRC)) {
8454 if (RI.isAGPRClass(NewDstRC))
8455 return nullptr;
8456
8457 switch (Inst.getOpcode()) {
8458 case AMDGPU::PHI:
8459 case AMDGPU::REG_SEQUENCE:
8460 case AMDGPU::INSERT_SUBREG:
8461 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
8462 break;
8463 default:
8464 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8465 }
8466
8467 if (!NewDstRC)
8468 return nullptr;
8469 } else {
8470 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
8471 return nullptr;
8472
8473 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8474 if (!NewDstRC)
8475 return nullptr;
8476 }
8477
8478 return NewDstRC;
8479 }
8480 default:
8481 return NewDstRC;
8482 }
8483}
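// Illustrative sketch (not part of the upstream file): how the mapping above
// behaves in a few cases. The destination/source classes are assumptions
// chosen for the example.
//
//   %d:sreg_64 = REG_SEQUENCE %s0, %subreg.sub0, %s1, %subreg.sub1
//       -> equivalent class VReg_64 (SGPR source, SGPR destination)
//   %d:sreg_64 = PHI %x, %bb.a, %y, %bb.b   ; inputs already in AGPRs
//       -> equivalent class AReg_64 (PHI/REG_SEQUENCE/INSERT_SUBREG keep AGPRs)
//   %d:vgpr_32 = COPY %v0
//       -> nullptr, the destination is already a VGPR and needs no change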
8484
8485// Find the one SGPR operand we are allowed to use.
8486Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
8487 int OpIndices[3]) const {
8488 const MCInstrDesc &Desc = MI.getDesc();
8489
8490 // Find the one SGPR operand we are allowed to use.
8491 //
8492 // First we need to consider the instruction's operand requirements before
8493 // legalizing. Some operands are required to be SGPRs, such as implicit uses
8494 // of VCC, but we are still bound by the constant bus requirement to only use
8495 // one.
8496 //
8497 // If the operand's class is an SGPR, we can never move it.
8498
8499 Register SGPRReg = findImplicitSGPRRead(MI);
8500 if (SGPRReg)
8501 return SGPRReg;
8502
8503 Register UsedSGPRs[3] = {Register()};
8504 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8505
8506 for (unsigned i = 0; i < 3; ++i) {
8507 int Idx = OpIndices[i];
8508 if (Idx == -1)
8509 break;
8510
8511 const MachineOperand &MO = MI.getOperand(Idx);
8512 if (!MO.isReg())
8513 continue;
8514
8515 // Is this operand statically required to be an SGPR based on the operand
8516 // constraints?
8517 const TargetRegisterClass *OpRC =
8518 RI.getRegClass(Desc.operands()[Idx].RegClass);
8519 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
8520 if (IsRequiredSGPR)
8521 return MO.getReg();
8522
8523 // If this could be a VGPR or an SGPR, check the dynamic register class.
8524 Register Reg = MO.getReg();
8525 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
8526 if (RI.isSGPRClass(RegRC))
8527 UsedSGPRs[i] = Reg;
8528 }
8529
8530 // We don't have a required SGPR operand, so we have a bit more freedom in
8531 // selecting operands to move.
8532
8533 // Try to select the most used SGPR. If an SGPR is equal to one of the
8534 // others, we choose that.
8535 //
8536 // e.g.
8537 // V_FMA_F32 v0, s0, s0, s0 -> No moves
8538 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
8539
8540 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
8541 // prefer those.
8542
8543 if (UsedSGPRs[0]) {
8544 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
8545 SGPRReg = UsedSGPRs[0];
8546 }
8547
8548 if (!SGPRReg && UsedSGPRs[1]) {
8549 if (UsedSGPRs[1] == UsedSGPRs[2])
8550 SGPRReg = UsedSGPRs[1];
8551 }
8552
8553 return SGPRReg;
8554}
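// Illustrative sketch (not part of the upstream file): the constant-bus
// reasoning above, spelled out on the V_FMA_F32 example already given in the
// comment. The operand indices and register names are assumptions.
//
//   V_FMA_F32 v0, s0, s1, s0   ; UsedSGPRs = { s0, s1, s0 }
//   ; s0 appears twice, so it is the one SGPR that may remain; the caller
//   ; must copy s1 into a VGPR to satisfy the single constant-bus read.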
8555
8556 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
8557 unsigned OperandName) const {
8558 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
8559 if (Idx == -1)
8560 return nullptr;
8561
8562 return &MI.getOperand(Idx);
8563}
8564
8570 return (Format << 44) |
8571 (1ULL << 56) | // RESOURCE_LEVEL = 1
8572 (3ULL << 60); // OOB_SELECT = 3
8573 }
8574
8575 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
8576 if (ST.isAmdHsaOS()) {
8577 // Set ATC = 1. GFX9 doesn't have this bit.
8579 RsrcDataFormat |= (1ULL << 56);
8580
8581 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
8582 // BTW, it disables TC L2 and therefore decreases performance.
8584 RsrcDataFormat |= (2ULL << 59);
8585 }
8586
8587 return RsrcDataFormat;
8588}
8589
8593 0xffffffff; // Size;
8594
8595 // GFX9 doesn't have ELEMENT_SIZE.
8597 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
8598 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
8599 }
8600
8601 // IndexStride = 64 / 32.
8602 uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
8603 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
8604
8605 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
8606 // Clear them unless we want a huge stride.
8609 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
8610
8611 return Rsrc23;
8612}
8613
8615 unsigned Opc = MI.getOpcode();
8616
8617 return isSMRD(Opc);
8618}
8619
8621 return get(Opc).mayLoad() &&
8622 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
8623}
8624
8626 int &FrameIndex) const {
8627 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
8628 if (!Addr || !Addr->isFI())
8629 return Register();
8630
8631 assert(!MI.memoperands_empty() &&
8632 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
8633
8634 FrameIndex = Addr->getIndex();
8635 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
8636}
8637
8639 int &FrameIndex) const {
8640 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
8641 assert(Addr && Addr->isFI());
8642 FrameIndex = Addr->getIndex();
8643 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
8644}
8645
8647 int &FrameIndex) const {
8648 if (!MI.mayLoad())
8649 return Register();
8650
8651 if (isMUBUF(MI) || isVGPRSpill(MI))
8652 return isStackAccess(MI, FrameIndex);
8653
8654 if (isSGPRSpill(MI))
8655 return isSGPRStackAccess(MI, FrameIndex);
8656
8657 return Register();
8658}
8659
8661 int &FrameIndex) const {
8662 if (!MI.mayStore())
8663 return Register();
8664
8665 if (isMUBUF(MI) || isVGPRSpill(MI))
8666 return isStackAccess(MI, FrameIndex);
8667
8668 if (isSGPRSpill(MI))
8669 return isSGPRStackAccess(MI, FrameIndex);
8670
8671 return Register();
8672}
8673
8675 unsigned Size = 0;
8677 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
8678 while (++I != E && I->isInsideBundle()) {
8679 assert(!I->isBundle() && "No nested bundle!");
8681 }
8682
8683 return Size;
8684}
8685
8687 unsigned Opc = MI.getOpcode();
8689 unsigned DescSize = Desc.getSize();
8690
8691 // If we have a definitive size, we can use it. Otherwise we need to inspect
8692 // the operands to know the size.
8693 if (isFixedSize(MI)) {
8694 unsigned Size = DescSize;
8695
8696 // If we hit the buggy offset, an extra nop will be inserted in MC so
8697 // estimate the worst case.
8698 if (MI.isBranch() && ST.hasOffset3fBug())
8699 Size += 4;
8700
8701 return Size;
8702 }
8703
8704 // Instructions may have a 32-bit literal encoded after them. Check
8705 // operands that could ever be literals.
8706 if (isVALU(MI) || isSALU(MI)) {
8707 if (isDPP(MI))
8708 return DescSize;
8709 bool HasLiteral = false;
8710 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
8711 const MachineOperand &Op = MI.getOperand(I);
8712 const MCOperandInfo &OpInfo = Desc.operands()[I];
8713 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
8714 HasLiteral = true;
8715 break;
8716 }
8717 }
8718 return HasLiteral ? DescSize + 4 : DescSize;
8719 }
8720
8721 // Check whether we have extra NSA words.
8722 if (isMIMG(MI)) {
8723 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
8724 if (VAddr0Idx < 0)
8725 return 8;
8726
8727 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
8728 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
8729 }
8730
8731 switch (Opc) {
8732 case TargetOpcode::BUNDLE:
8733 return getInstBundleSize(MI);
8734 case TargetOpcode::INLINEASM:
8735 case TargetOpcode::INLINEASM_BR: {
8736 const MachineFunction *MF = MI.getParent()->getParent();
8737 const char *AsmStr = MI.getOperand(0).getSymbolName();
8738 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
8739 }
8740 default:
8741 if (MI.isMetaInstruction())
8742 return 0;
8743 return DescSize;
8744 }
8745}
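// Worked example (not part of the upstream file): for a 4-byte SALU encoding
// whose source immediate is not an inline constant, the function above reports
// DescSize + 4 for the trailing 32-bit literal. Assuming the usual encodings,
// s_mov_b32 s0, 0x12345678 is reported as 8 bytes, while s_mov_b32 s0, 1
// (an inline constant) stays at 4 bytes.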
8746
8748 if (!isFLAT(MI))
8749 return false;
8750
8751 if (MI.memoperands_empty())
8752 return true;
8753
8754 for (const MachineMemOperand *MMO : MI.memoperands()) {
8755 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
8756 return true;
8757 }
8758 return false;
8759}
8760
8762 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
8763}
8764
8766 MachineBasicBlock *IfEnd) const {
8768 assert(TI != IfEntry->end());
8769
8770 MachineInstr *Branch = &(*TI);
8771 MachineFunction *MF = IfEntry->getParent();
8773
8774 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8775 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8776 MachineInstr *SIIF =
8777 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
8778 .add(Branch->getOperand(0))
8779 .add(Branch->getOperand(1));
8780 MachineInstr *SIEND =
8781 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
8782 .addReg(DstReg);
8783
8784 IfEntry->erase(TI);
8785 IfEntry->insert(IfEntry->end(), SIIF);
8786 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
8787 }
8788}
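// Illustrative sketch (not part of the upstream file): the rewrite performed
// above on a divergent branch. Block and register names are assumptions.
//
//   ; Before, in the entry block of the if-region:
//   SI_NON_UNIFORM_BRCOND_PSEUDO %cond, %bb.flow
//
//   ; After: the branch becomes an exec-mask manipulating SI_IF, and the join
//   ; block re-enables the inactive lanes with SI_END_CF.
//   %saved:sreg_64 = SI_IF %cond, %bb.flow
//   ...
//   SI_END_CF %saved            ; inserted at the start of the if-end block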
8789
8791 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
8793 // We expect 2 terminators, one conditional and one unconditional.
8794 assert(TI != LoopEnd->end());
8795
8796 MachineInstr *Branch = &(*TI);
8797 MachineFunction *MF = LoopEnd->getParent();
8799
8800 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8801
8802 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8803 Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
8804 MachineInstrBuilder HeaderPHIBuilder =
8805 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
8806 for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) {
8807 if (PMBB == LoopEnd) {
8808 HeaderPHIBuilder.addReg(BackEdgeReg);
8809 } else {
8810 Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
8811 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
8812 ZeroReg, 0);
8813 HeaderPHIBuilder.addReg(ZeroReg);
8814 }
8815 HeaderPHIBuilder.addMBB(PMBB);
8816 }
8817 MachineInstr *HeaderPhi = HeaderPHIBuilder;
8818 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
8819 get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
8820 .addReg(DstReg)
8821 .add(Branch->getOperand(0));
8822 MachineInstr *SILOOP =
8823 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
8824 .addReg(BackEdgeReg)
8825 .addMBB(LoopEntry);
8826
8827 LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
8828 LoopEnd->erase(TI);
8829 LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
8830 LoopEnd->insert(LoopEnd->end(), SILOOP);
8831 }
8832}
8833
8836 static const std::pair<int, const char *> TargetIndices[] = {
8837 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
8838 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
8839 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
8840 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
8841 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
8842 return ArrayRef(TargetIndices);
8843}
8844
8845/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
8846/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
8849 const ScheduleDAG *DAG) const {
8850 return new GCNHazardRecognizer(DAG->MF);
8851}
8852
8853/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
8854/// pass.
8857 return new GCNHazardRecognizer(MF);
8858}
8859
8860// Called during:
8861// - pre-RA scheduling and post-RA scheduling
8864 const ScheduleDAGMI *DAG) const {
8865 // Borrowed from Arm Target
8866 // We would like to restrict this hazard recognizer to only
8867 // post-RA scheduling; we can tell that we're post-RA because we don't
8868 // track VRegLiveness.
8869 if (!DAG->hasVRegLiveness())
8870 return new GCNHazardRecognizer(DAG->MF);
8872}
8873
8874std::pair<unsigned, unsigned>
8876 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
8877}
8878
8881 static const std::pair<unsigned, const char *> TargetFlags[] = {
8882 { MO_GOTPCREL, "amdgpu-gotprel" },
8883 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
8884 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
8885 { MO_REL32_LO, "amdgpu-rel32-lo" },
8886 { MO_REL32_HI, "amdgpu-rel32-hi" },
8887 { MO_ABS32_LO, "amdgpu-abs32-lo" },
8888 { MO_ABS32_HI, "amdgpu-abs32-hi" },
8889 };
8890
8891 return ArrayRef(TargetFlags);
8892}
8893
8896 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8897 {
8898 {MONoClobber, "amdgpu-noclobber"},
8899 {MOLastUse, "amdgpu-last-use"},
8900 };
8901
8902 return ArrayRef(TargetFlags);
8903}
8904
8906 const MachineFunction &MF) const {
8908 assert(SrcReg.isVirtual());
8909 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
8910 return AMDGPU::WWM_COPY;
8911
8912 return AMDGPU::COPY;
8913}
8914
8916 Register Reg) const {
8917 // We need to handle instructions which may be inserted during register
8918 // allocation to handle the prolog. The initial prolog instruction may have
8919 // been separated from the start of the block by spills and copies inserted
8920 // needed by the prolog. However, the insertions for scalar registers can
8921 // always be placed at the BB top as they are independent of the exec mask
8922 // value.
8923 bool IsNullOrVectorRegister = true;
8924 if (Reg) {
8925 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8926 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
8927 }
8928
8929 uint16_t Opcode = MI.getOpcode();
8930 // FIXME: Copies inserted in the block prolog for live-range split should also
8931 // be included.
8932 return IsNullOrVectorRegister &&
8933 (isSpill(Opcode) || (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
8934 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
8935}
8936
8940 const DebugLoc &DL,
8941 Register DestReg) const {
8942 if (ST.hasAddNoCarry())
8943 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
8944
8946 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
8947 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
8948
8949 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8950 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8951}
8952
8955 const DebugLoc &DL,
8956 Register DestReg,
8957 RegScavenger &RS) const {
8958 if (ST.hasAddNoCarry())
8959 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
8960
8961 // If available, prefer to use vcc.
8962 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
8963 ? Register(RI.getVCC())
8965 *RI.getBoolRC(), I, /* RestoreAfter */ false,
8966 0, /* AllowSpill */ false);
8967
8968 // TODO: Users need to deal with this.
8969 if (!UnusedCarry.isValid())
8970 return MachineInstrBuilder();
8971
8972 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8973 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8974}
8975
8976bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
8977 switch (Opcode) {
8978 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
8979 case AMDGPU::SI_KILL_I1_TERMINATOR:
8980 return true;
8981 default:
8982 return false;
8983 }
8984}
8985
8987 switch (Opcode) {
8988 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
8989 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
8990 case AMDGPU::SI_KILL_I1_PSEUDO:
8991 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
8992 default:
8993 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
8994 }
8995}
8996
8997bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
8998 return Imm <= getMaxMUBUFImmOffset(ST);
8999}
9000
9002 // The GFX12 field is a 24-bit signed byte offset, but it must be non-negative here.
9003 const unsigned OffsetBits =
9004 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9005 return (1 << OffsetBits) - 1;
9006}
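// Worked example (not part of the upstream file): with the bit counts used
// above, the maximum legal MUBUF immediate offset is (1 << 12) - 1 = 4095
// before GFX12, and (1 << 23) - 1 = 8388607 on GFX12, where only the
// non-negative half of the 24-bit signed field is usable.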
9007
9009 if (!ST.isWave32())
9010 return;
9011
9012 if (MI.isInlineAsm())
9013 return;
9014
9015 for (auto &Op : MI.implicit_operands()) {
9016 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9017 Op.setReg(AMDGPU::VCC_LO);
9018 }
9019}
9020
9022 if (!isSMRD(MI))
9023 return false;
9024
9025 // Check that it is using a buffer resource.
9026 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9027 if (Idx == -1) // e.g. s_memtime
9028 return false;
9029
9030 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9031 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9032}
9033
9034// Given Imm, split it into the values to put into the SOffset and ImmOffset
9035// fields in an MUBUF instruction. Return false if it is not possible (due to a
9036// hardware bug needing a workaround).
9037//
9038// The required alignment ensures that individual address components remain
9039// aligned if they are aligned to begin with. It also ensures that additional
9040// offsets within the given alignment can be added to the resulting ImmOffset.
9042 uint32_t &ImmOffset, Align Alignment) const {
9043 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9044 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9045 uint32_t Overflow = 0;
9046
9047 if (Imm > MaxImm) {
9048 if (Imm <= MaxImm + 64) {
9049 // Use an SOffset inline constant for 4..64
9050 Overflow = Imm - MaxImm;
9051 Imm = MaxImm;
9052 } else {
9053 // Try to keep the same value in SOffset for adjacent loads, so that
9054 // the corresponding register contents can be re-used.
9055 //
9056 // Load values with all low-bits (except for alignment bits) set into
9057 // SOffset, so that a larger range of values can be covered using
9058 // s_movk_i32.
9059 //
9060 // Atomic operations fail to work correctly when individual address
9061 // components are unaligned, even if their sum is aligned.
9062 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9063 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9064 Imm = Low;
9065 Overflow = High - Alignment.value();
9066 }
9067 }
9068
9069 if (Overflow > 0) {
9070 // There is a hardware bug in SI and CI which prevents address clamping in
9071 // MUBUF instructions from working correctly with SOffsets. The immediate
9072 // offset is unaffected.
9074 return false;
9075
9076 // It is not possible to set an immediate in the SOffset field on some targets.
9077 if (ST.hasRestrictedSOffset())
9078 return false;
9079 }
9080
9081 ImmOffset = Imm;
9082 SOffset = Overflow;
9083 return true;
9084}
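// Worked example (not part of the upstream file), assuming a pre-GFX12 limit
// of 4095, 4-byte alignment (so MaxImm = 4092), and a target where a non-zero
// SOffset is allowed:
//
//   Imm = 4100: 4100 <= 4092 + 64, so ImmOffset = 4092 and SOffset = 8
//               (an SOffset inline constant); 4092 + 8 == 4100.
//   Imm = 8200: the else branch keeps the aligned low bits (12) in ImmOffset
//               and puts the large, still 4-aligned remainder into SOffset.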
9085
9086// Depending on the used address space and instructions, some immediate offsets
9087// are allowed and some are not.
9088 // Pre-GFX12, flat instruction offsets can only be non-negative, while global
9089 // and scratch instruction offsets can also be negative. On GFX12, offsets can
9090 // be negative for all variants.
9091//
9092// There are several bugs related to these offsets:
9093// On gfx10.1, flat instructions that go into the global address space cannot
9094// use an offset.
9095//
9096// For scratch instructions, the address can be either an SGPR or a VGPR.
9097// The following offsets can be used, depending on the architecture (x means
9098// cannot be used):
9099// +----------------------------+------+------+
9100// | Address-Mode | SGPR | VGPR |
9101// +----------------------------+------+------+
9102// | gfx9 | | |
9103// | negative, 4-aligned offset | x | ok |
9104// | negative, unaligned offset | x | ok |
9105// +----------------------------+------+------+
9106// | gfx10 | | |
9107// | negative, 4-aligned offset | ok | ok |
9108// | negative, unaligned offset | ok | x |
9109// +----------------------------+------+------+
9110// | gfx10.3 | | |
9111// | negative, 4-aligned offset | ok | ok |
9112// | negative, unaligned offset | ok | ok |
9113// +----------------------------+------+------+
9114//
9115// This function ignores the addressing mode, so if an offset cannot be used in
9116// one addressing mode, it is considered illegal.
9117bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9118 uint64_t FlatVariant) const {
9119 // TODO: Should 0 be special cased?
9120 if (!ST.hasFlatInstOffsets())
9121 return false;
9122
9123 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9124 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9125 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9126 return false;
9127
9129 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9130 (Offset % 4) != 0) {
9131 return false;
9132 }
9133
9134 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9135 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9136 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9137}
9138
9139// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
9140std::pair<int64_t, int64_t>
9141SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9142 uint64_t FlatVariant) const {
9143 int64_t RemainderOffset = COffsetVal;
9144 int64_t ImmField = 0;
9145
9146 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9147 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9148
9149 if (AllowNegative) {
9150 // Use signed division by a power of two to truncate towards 0.
9151 int64_t D = 1LL << NumBits;
9152 RemainderOffset = (COffsetVal / D) * D;
9153 ImmField = COffsetVal - RemainderOffset;
9154
9156 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9157 (ImmField % 4) != 0) {
9158 // Make ImmField a multiple of 4
9159 RemainderOffset += ImmField % 4;
9160 ImmField -= ImmField % 4;
9161 }
9162 } else if (COffsetVal >= 0) {
9163 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9164 RemainderOffset = COffsetVal - ImmField;
9165 }
9166
9167 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9168 assert(RemainderOffset + ImmField == COffsetVal);
9169 return {ImmField, RemainderOffset};
9170}
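// Worked example (not part of the upstream file), assuming a global/scratch
// variant with a 13-bit signed offset field, i.e. NumBits = 12 above:
//
//   COffsetVal = 9000:  D = 4096, RemainderOffset = 8192, ImmField = 808,
//                       and 8192 + 808 == 9000 as the assertions require.
//   COffsetVal = -5000 (negative offsets allowed): RemainderOffset = -4096,
//                       ImmField = -904, which still fits the signed field.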
9171
9173 if (ST.hasNegativeScratchOffsetBug() &&
9174 FlatVariant == SIInstrFlags::FlatScratch)
9175 return false;
9176
9177 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9178}
9179
9180static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9181 switch (ST.getGeneration()) {
9182 default:
9183 break;
9186 return SIEncodingFamily::SI;
9189 return SIEncodingFamily::VI;
9196 }
9197 llvm_unreachable("Unknown subtarget generation!");
9198}
9199
9200bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9201 switch(MCOp) {
9202 // These opcodes use indirect register addressing so
9203 // they need special handling by codegen (currently missing).
9204 // Therefore it is too risky to allow these opcodes
9205 // to be selected by dpp combiner or sdwa peepholer.
9206 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9207 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9208 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9209 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9210 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9211 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9212 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9213 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9214 return true;
9215 default:
9216 return false;
9217 }
9218}
9219
9220int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9221 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9222
9223 unsigned Gen = subtargetEncodingFamily(ST);
9224
9225 if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
9228
9229 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9230 // subtarget has UnpackedD16VMem feature.
9231 // TODO: remove this when we discard GFX80 encoding.
9232 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9234
9235 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9236 switch (ST.getGeneration()) {
9237 default:
9239 break;
9242 break;
9245 break;
9246 }
9247 }
9248
9249 if (isMAI(Opcode)) {
9250 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9251 if (MFMAOp != -1)
9252 Opcode = MFMAOp;
9253 }
9254
9255 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9256
9257 // -1 means that Opcode is already a native instruction.
9258 if (MCOp == -1)
9259 return Opcode;
9260
9261 if (ST.hasGFX90AInsts()) {
9262 uint16_t NMCOp = (uint16_t)-1;
9263 if (ST.hasGFX940Insts())
9265 if (NMCOp == (uint16_t)-1)
9267 if (NMCOp == (uint16_t)-1)
9269 if (NMCOp != (uint16_t)-1)
9270 MCOp = NMCOp;
9271 }
9272
9273 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9274 // no encoding in the given subtarget generation.
9275 if (MCOp == (uint16_t)-1)
9276 return -1;
9277
9278 if (isAsmOnlyOpcode(MCOp))
9279 return -1;
9280
9281 return MCOp;
9282}
9283
9284static
9286 assert(RegOpnd.isReg());
9287 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9288 getRegSubRegPair(RegOpnd);
9289}
9290
9293 assert(MI.isRegSequence());
9294 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9295 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9296 auto &RegOp = MI.getOperand(1 + 2 * I);
9297 return getRegOrUndef(RegOp);
9298 }
9300}
9301
9302// Try to find the definition of reg:subreg in subreg-manipulation pseudos
9303// Following a subreg of reg:subreg isn't supported
9306 if (!RSR.SubReg)
9307 return false;
9308 switch (MI.getOpcode()) {
9309 default: break;
9310 case AMDGPU::REG_SEQUENCE:
9311 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
9312 return true;
9313 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg.
9314 case AMDGPU::INSERT_SUBREG:
9315 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
9316 // inserted the subreg we're looking for
9317 RSR = getRegOrUndef(MI.getOperand(2));
9318 else { // the subreg in the rest of the reg
9319 auto R1 = getRegOrUndef(MI.getOperand(1));
9320 if (R1.SubReg) // subreg of subreg isn't supported
9321 return false;
9322 RSR.Reg = R1.Reg;
9323 }
9324 return true;
9325 }
9326 return false;
9327}
9328
9331 assert(MRI.isSSA());
9332 if (!P.Reg.isVirtual())
9333 return nullptr;
9334
9335 auto RSR = P;
9336 auto *DefInst = MRI.getVRegDef(RSR.Reg);
9337 while (auto *MI = DefInst) {
9338 DefInst = nullptr;
9339 switch (MI->getOpcode()) {
9340 case AMDGPU::COPY:
9341 case AMDGPU::V_MOV_B32_e32: {
9342 auto &Op1 = MI->getOperand(1);
9343 if (Op1.isReg() && Op1.getReg().isVirtual()) {
9344 if (Op1.isUndef())
9345 return nullptr;
9346 RSR = getRegSubRegPair(Op1);
9347 DefInst = MRI.getVRegDef(RSR.Reg);
9348 }
9349 break;
9350 }
9351 default:
9352 if (followSubRegDef(*MI, RSR)) {
9353 if (!RSR.Reg)
9354 return nullptr;
9355 DefInst = MRI.getVRegDef(RSR.Reg);
9356 }
9357 }
9358 if (!DefInst)
9359 return MI;
9360 }
9361 return nullptr;
9362}
9363
9365 Register VReg,
9366 const MachineInstr &DefMI,
9367 const MachineInstr &UseMI) {
9368 assert(MRI.isSSA() && "Must be run on SSA");
9369
9370 auto *TRI = MRI.getTargetRegisterInfo();
9371 auto *DefBB = DefMI.getParent();
9372
9373 // Don't bother searching between blocks, although it is possible this block
9374 // doesn't modify exec.
9375 if (UseMI.getParent() != DefBB)
9376 return true;
9377
9378 const int MaxInstScan = 20;
9379 int NumInst = 0;
9380
9381 // Stop scan at the use.
9382 auto E = UseMI.getIterator();
9383 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
9384 if (I->isDebugInstr())
9385 continue;
9386
9387 if (++NumInst > MaxInstScan)
9388 return true;
9389
9390 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9391 return true;
9392 }
9393
9394 return false;
9395}
9396
9398 Register VReg,
9399 const MachineInstr &DefMI) {
9400 assert(MRI.isSSA() && "Must be run on SSA");
9401
9402 auto *TRI = MRI.getTargetRegisterInfo();
9403 auto *DefBB = DefMI.getParent();
9404
9405 const int MaxUseScan = 10;
9406 int NumUse = 0;
9407
9408 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
9409 auto &UseInst = *Use.getParent();
9410 // Don't bother searching between blocks, although it is possible this block
9411 // doesn't modify exec.
9412 if (UseInst.getParent() != DefBB || UseInst.isPHI())
9413 return true;
9414
9415 if (++NumUse > MaxUseScan)
9416 return true;
9417 }
9418
9419 if (NumUse == 0)
9420 return false;
9421
9422 const int MaxInstScan = 20;
9423 int NumInst = 0;
9424
9425 // Stop scan when we have seen all the uses.
9426 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
9427 assert(I != DefBB->end());
9428
9429 if (I->isDebugInstr())
9430 continue;
9431
9432 if (++NumInst > MaxInstScan)
9433 return true;
9434
9435 for (const MachineOperand &Op : I->operands()) {
9436 // We don't check reg masks here as they're used only on calls:
9437 // 1. EXEC is only considered const within one BB
9438 // 2. Call should be a terminator instruction if present in a BB
9439
9440 if (!Op.isReg())
9441 continue;
9442
9443 Register Reg = Op.getReg();
9444 if (Op.isUse()) {
9445 if (Reg == VReg && --NumUse == 0)
9446 return false;
9447 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
9448 return true;
9449 }
9450 }
9451}
9452
9455 const DebugLoc &DL, Register Src, Register Dst) const {
9456 auto Cur = MBB.begin();
9457 if (Cur != MBB.end())
9458 do {
9459 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
9460 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
9461 ++Cur;
9462 } while (Cur != MBB.end() && Cur != LastPHIIt);
9463
9464 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
9465 Dst);
9466}
9467
9470 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
9471 if (InsPt != MBB.end() &&
9472 (InsPt->getOpcode() == AMDGPU::SI_IF ||
9473 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9474 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9475 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
9476 InsPt++;
9477 return BuildMI(MBB, InsPt, DL,
9478 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
9479 : AMDGPU::S_MOV_B64_term),
9480 Dst)
9481 .addReg(Src, 0, SrcSubReg)
9482 .addReg(AMDGPU::EXEC, RegState::Implicit);
9483 }
9484 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
9485 Dst);
9486}
9487
9488bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
9489
9492 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
9493 VirtRegMap *VRM) const {
9494 // This is a bit of a hack (copied from AArch64). Consider this instruction:
9495 //
9496 // %0:sreg_32 = COPY $m0
9497 //
9498 // We explicitly chose SReg_32 for the virtual register so such a copy might
9499 // be eliminated by RegisterCoalescer. However, that may not be possible, and
9500 // %0 may even spill. We can't spill $m0 normally (it would require copying to
9501 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
9502 // TargetInstrInfo::foldMemoryOperand() is going to try.
9503 // A similar issue also exists with spilling and reloading $exec registers.
9504 //
9505 // To prevent that, constrain the %0 register class here.
9506 if (isFullCopyInstr(MI)) {
9507 Register DstReg = MI.getOperand(0).getReg();
9508 Register SrcReg = MI.getOperand(1).getReg();
9509 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
9510 (DstReg.isVirtual() != SrcReg.isVirtual())) {
9512 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
9513 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
9514 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
9515 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
9516 return nullptr;
9517 }
9518 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
9519 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
9520 return nullptr;
9521 }
9522 }
9523 }
9524
9525 return nullptr;
9526}
9527
9529 const MachineInstr &MI,
9530 unsigned *PredCost) const {
9531 if (MI.isBundle()) {
9533 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
9534 unsigned Lat = 0, Count = 0;
9535 for (++I; I != E && I->isBundledWithPred(); ++I) {
9536 ++Count;
9537 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
9538 }
9539 return Lat + Count - 1;
9540 }
9541
9542 return SchedModel.computeInstrLatency(&MI);
9543}
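// Worked example (not part of the upstream file): for a bundle containing
// three instructions with latencies 4, 2 and 1, the loop above computes
// Lat = 4 and Count = 3, so the reported bundle latency is 4 + 3 - 1 = 6.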
9544
9547 unsigned opcode = MI.getOpcode();
9548 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
9549 auto IID = GI->getIntrinsicID();
9554
9555 switch (IID) {
9556 case Intrinsic::amdgcn_if:
9557 case Intrinsic::amdgcn_else:
9558 // FIXME: Uniform if second result
9559 break;
9560 }
9561
9563 }
9564
9565 // Loads from the private and flat address spaces are divergent, because
9566 // threads can execute the load instruction with the same inputs and get
9567 // different results.
9568 //
9569 // All other loads are not divergent, because if threads issue loads with the
9570 // same arguments, they will always get the same result.
9571 if (opcode == AMDGPU::G_LOAD) {
9572 if (MI.memoperands_empty())
9573 return InstructionUniformity::NeverUniform; // conservative assumption
9574
9575 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9576 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9577 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9578 })) {
9579 // At least one MMO in a non-global address space.
9581 }
9583 }
9584
9585 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
9586 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
9587 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
9588 AMDGPU::isGenericAtomic(opcode)) {
9590 }
9592}
9593
9596
9597 if (isNeverUniform(MI))
9599
9600 unsigned opcode = MI.getOpcode();
9601 if (opcode == AMDGPU::V_READLANE_B32 ||
9602 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
9603 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
9605
9606 if (isCopyInstr(MI)) {
9607 const MachineOperand &srcOp = MI.getOperand(1);
9608 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
9609 const TargetRegisterClass *regClass =
9610 RI.getPhysRegBaseClass(srcOp.getReg());
9613 }
9615 }
9616
9617 // GMIR handling
9618 if (MI.isPreISelOpcode())
9620
9621 // Atomics are divergent because they are executed sequentially: when an
9622 // atomic operation refers to the same address in each thread, each
9623 // thread after the first sees the value written by the previous thread as
9624 // its original value.
9625
9626 if (isAtomic(MI))
9628
9629 // Loads from the private and flat address spaces are divergent, because
9630 // threads can execute the load instruction with the same inputs and get
9631 // different results.
9632 if (isFLAT(MI) && MI.mayLoad()) {
9633 if (MI.memoperands_empty())
9634 return InstructionUniformity::NeverUniform; // conservative assumption
9635
9636 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9637 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9638 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9639 })) {
9640 // At least one MMO in a non-global address space.
9642 }
9643
9645 }
9646
9647 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9648 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
9649
9650 // FIXME: It's conceptually broken to report this for an instruction, and not
9651 // a specific def operand. For inline asm in particular, there could be mixed
9652 // uniform and divergent results.
9653 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
9654 const MachineOperand &SrcOp = MI.getOperand(I);
9655 if (!SrcOp.isReg())
9656 continue;
9657
9658 Register Reg = SrcOp.getReg();
9659 if (!Reg || !SrcOp.readsReg())
9660 continue;
9661
9662 // If RegBank is null, this is unassigned or an unallocatable special
9663 // register, which are all scalars.
9664 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
9665 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
9667 }
9668
9669 // TODO: The uniformity check conditions above can be rearranged for more
9670 // readability.
9671
9672 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
9673 // currently turned into no-op COPYs by SelectionDAG ISel and are
9674 // therefore no longer recognizable.
9675
9677}
9678
9680 switch (MF.getFunction().getCallingConv()) {
9682 return 1;
9684 return 2;
9686 return 3;
9690 report_fatal_error("ds_ordered_count unsupported for this calling conv");
9693 case CallingConv::C:
9694 case CallingConv::Fast:
9695 default:
9696 // Assume other calling conventions are various compute callable functions
9697 return 0;
9698 }
9699}
9700
9702 Register &SrcReg2, int64_t &CmpMask,
9703 int64_t &CmpValue) const {
9704 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
9705 return false;
9706
9707 switch (MI.getOpcode()) {
9708 default:
9709 break;
9710 case AMDGPU::S_CMP_EQ_U32:
9711 case AMDGPU::S_CMP_EQ_I32:
9712 case AMDGPU::S_CMP_LG_U32:
9713 case AMDGPU::S_CMP_LG_I32:
9714 case AMDGPU::S_CMP_LT_U32:
9715 case AMDGPU::S_CMP_LT_I32:
9716 case AMDGPU::S_CMP_GT_U32:
9717 case AMDGPU::S_CMP_GT_I32:
9718 case AMDGPU::S_CMP_LE_U32:
9719 case AMDGPU::S_CMP_LE_I32:
9720 case AMDGPU::S_CMP_GE_U32:
9721 case AMDGPU::S_CMP_GE_I32:
9722 case AMDGPU::S_CMP_EQ_U64:
9723 case AMDGPU::S_CMP_LG_U64:
9724 SrcReg = MI.getOperand(0).getReg();
9725 if (MI.getOperand(1).isReg()) {
9726 if (MI.getOperand(1).getSubReg())
9727 return false;
9728 SrcReg2 = MI.getOperand(1).getReg();
9729 CmpValue = 0;
9730 } else if (MI.getOperand(1).isImm()) {
9731 SrcReg2 = Register();
9732 CmpValue = MI.getOperand(1).getImm();
9733 } else {
9734 return false;
9735 }
9736 CmpMask = ~0;
9737 return true;
9738 case AMDGPU::S_CMPK_EQ_U32:
9739 case AMDGPU::S_CMPK_EQ_I32:
9740 case AMDGPU::S_CMPK_LG_U32:
9741 case AMDGPU::S_CMPK_LG_I32:
9742 case AMDGPU::S_CMPK_LT_U32:
9743 case AMDGPU::S_CMPK_LT_I32:
9744 case AMDGPU::S_CMPK_GT_U32:
9745 case AMDGPU::S_CMPK_GT_I32:
9746 case AMDGPU::S_CMPK_LE_U32:
9747 case AMDGPU::S_CMPK_LE_I32:
9748 case AMDGPU::S_CMPK_GE_U32:
9749 case AMDGPU::S_CMPK_GE_I32:
9750 SrcReg = MI.getOperand(0).getReg();
9751 SrcReg2 = Register();
9752 CmpValue = MI.getOperand(1).getImm();
9753 CmpMask = ~0;
9754 return true;
9755 }
9756
9757 return false;
9758}
9759
9761 Register SrcReg2, int64_t CmpMask,
9762 int64_t CmpValue,
9763 const MachineRegisterInfo *MRI) const {
9764 if (!SrcReg || SrcReg.isPhysical())
9765 return false;
9766
9767 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
9768 return false;
9769
9770 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
9771 this](int64_t ExpectedValue, unsigned SrcSize,
9772 bool IsReversible, bool IsSigned) -> bool {
9773 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9774 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9775 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9776 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9777 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
9778 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9779 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9780 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9781 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9782 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
9783 //
9784 // Signed ge/gt are not used for the sign bit.
9785 //
9786 // If result of the AND is unused except in the compare:
9787 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
9788 //
9789 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9790 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9791 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
9792 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9793 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9794 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
9795
9796 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
9797 if (!Def || Def->getParent() != CmpInstr.getParent())
9798 return false;
9799
9800 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
9801 Def->getOpcode() != AMDGPU::S_AND_B64)
9802 return false;
9803
9804 int64_t Mask;
9805 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
9806 if (MO->isImm())
9807 Mask = MO->getImm();
9808 else if (!getFoldableImm(MO, Mask))
9809 return false;
9810 Mask &= maxUIntN(SrcSize);
9811 return isPowerOf2_64(Mask);
9812 };
9813
9814 MachineOperand *SrcOp = &Def->getOperand(1);
9815 if (isMask(SrcOp))
9816 SrcOp = &Def->getOperand(2);
9817 else if (isMask(&Def->getOperand(2)))
9818 SrcOp = &Def->getOperand(1);
9819 else
9820 return false;
9821
9822 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
9823 if (IsSigned && BitNo == SrcSize - 1)
9824 return false;
9825
9826 ExpectedValue <<= BitNo;
9827
9828 bool IsReversedCC = false;
9829 if (CmpValue != ExpectedValue) {
9830 if (!IsReversible)
9831 return false;
9832 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
9833 if (!IsReversedCC)
9834 return false;
9835 }
9836
9837 Register DefReg = Def->getOperand(0).getReg();
9838 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
9839 return false;
9840
9841 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
9842 I != E; ++I) {
9843 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
9844 I->killsRegister(AMDGPU::SCC, &RI))
9845 return false;
9846 }
9847
9848 MachineOperand *SccDef =
9849 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
9850 SccDef->setIsDead(false);
9851 CmpInstr.eraseFromParent();
9852
9853 if (!MRI->use_nodbg_empty(DefReg)) {
9854 assert(!IsReversedCC);
9855 return true;
9856 }
9857
9858 // Replace AND with unused result with a S_BITCMP.
9859 MachineBasicBlock *MBB = Def->getParent();
9860
9861 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
9862 : AMDGPU::S_BITCMP1_B32
9863 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
9864 : AMDGPU::S_BITCMP1_B64;
9865
9866 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
9867 .add(*SrcOp)
9868 .addImm(BitNo);
9869 Def->eraseFromParent();
9870
9871 return true;
9872 };
9873
9874 switch (CmpInstr.getOpcode()) {
9875 default:
9876 break;
9877 case AMDGPU::S_CMP_EQ_U32:
9878 case AMDGPU::S_CMP_EQ_I32:
9879 case AMDGPU::S_CMPK_EQ_U32:
9880 case AMDGPU::S_CMPK_EQ_I32:
9881 return optimizeCmpAnd(1, 32, true, false);
9882 case AMDGPU::S_CMP_GE_U32:
9883 case AMDGPU::S_CMPK_GE_U32:
9884 return optimizeCmpAnd(1, 32, false, false);
9885 case AMDGPU::S_CMP_GE_I32:
9886 case AMDGPU::S_CMPK_GE_I32:
9887 return optimizeCmpAnd(1, 32, false, true);
9888 case AMDGPU::S_CMP_EQ_U64:
9889 return optimizeCmpAnd(1, 64, true, false);
9890 case AMDGPU::S_CMP_LG_U32:
9891 case AMDGPU::S_CMP_LG_I32:
9892 case AMDGPU::S_CMPK_LG_U32:
9893 case AMDGPU::S_CMPK_LG_I32:
9894 return optimizeCmpAnd(0, 32, true, false);
9895 case AMDGPU::S_CMP_GT_U32:
9896 case AMDGPU::S_CMPK_GT_U32:
9897 return optimizeCmpAnd(0, 32, false, false);
9898 case AMDGPU::S_CMP_GT_I32:
9899 case AMDGPU::S_CMPK_GT_I32:
9900 return optimizeCmpAnd(0, 32, false, true);
9901 case AMDGPU::S_CMP_LG_U64:
9902 return optimizeCmpAnd(0, 64, true, false);
9903 }
9904
9905 return false;
9906}
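// Illustrative sketch (not part of the upstream file) of the folds performed
// by optimizeCmpAnd above. Register names are assumptions for the example.
//
//   %0 = S_AND_B32 %src, 4        ; 1 << 2, also defines SCC
//   S_CMP_EQ_U32 %0, 4            ; compares against the same single bit
//   ; -> the compare is erased and SCC from the S_AND_B32 is used directly.
//
//   ; If %0 has no other uses, the AND itself is replaced:
//   S_BITCMP1_B32 %src, 2         ; sets SCC to bit 2 of %src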
9907
9909 unsigned OpName) const {
9910 if (!ST.needsAlignedVGPRs())
9911 return;
9912
9913 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
9914 if (OpNo < 0)
9915 return;
9916 MachineOperand &Op = MI.getOperand(OpNo);
9917 if (getOpSize(MI, OpNo) > 4)
9918 return;
9919
9920 // Add implicit aligned super-reg to force alignment on the data operand.
9921 const DebugLoc &DL = MI.getDebugLoc();
9922 MachineBasicBlock *BB = MI.getParent();
9924 Register DataReg = Op.getReg();
9925 bool IsAGPR = RI.isAGPR(MRI, DataReg);
9926 Register Undef = MRI.createVirtualRegister(
9927 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
9928 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
9929 Register NewVR =
9930 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
9931 : &AMDGPU::VReg_64_Align2RegClass);
9932 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
9933 .addReg(DataReg, 0, Op.getSubReg())
9934 .addImm(AMDGPU::sub0)
9935 .addReg(Undef)
9936 .addImm(AMDGPU::sub1);
9937 Op.setReg(NewVR);
9938 Op.setSubReg(AMDGPU::sub0);
9939 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
9940}
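// Illustrative sketch (not part of the upstream file): the effect of
// enforceOperandRCAlignment on a 32-bit data operand when the subtarget
// requires even-aligned VGPR tuples. The opcode and register names are
// assumptions for the example.
//
//   ; Before:               DS_WRITE_B32 ... %data:vgpr_32 ...
//   ; After:
//   %undef:vgpr_32 = IMPLICIT_DEF
//   %pair:vreg_64_align2 = REG_SEQUENCE %data, %subreg.sub0,
//                                       %undef, %subreg.sub1
//   ; The instruction now uses %pair.sub0 and implicitly uses %pair, which
//   ; forces the register allocator to start the pair at an even VGPR.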
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
amdgpu AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
TargetInstrInfo::RegSubRegPair RegSubRegPair
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillRestoreOpcode(unsigned Size)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MachineRegisterInfo &MRI, const MCInstrDesc &TID, unsigned RCID, bool IsAllocatable)
static unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillSaveOpcode(unsigned Size)
static bool resultDependsOnExec(const MachineInstr &MI)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
Definition: SIInstrInfo.cpp:82
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static constexpr unsigned ModifierOpNames[]
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
Definition: SIInstrInfo.cpp:73
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool IsDead
static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool has16BitInsts() const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
Class for arbitrary precision integers.
Definition: APInt.h:78
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:807
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1522
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:274
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
bool useVGPRIndexMode() const
bool hasSDWAOmod() const
Definition: GCNSubtarget.h:744
bool hasA16() const
bool hasSDWAScalar() const
Definition: GCNSubtarget.h:748
bool hasScalarCompareEq64() const
Definition: GCNSubtarget.h:998
bool hasOnlyRevVALUShifts() const
Definition: GCNSubtarget.h:390
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:626
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:764
bool hasMAIInsts() const
Definition: GCNSubtarget.h:814
bool hasMFMAInlineLiteralBug() const
bool hasNegativeScratchOffsetBug() const
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasR128A16() const
bool hasOffset3fBug() const
bool hasGetPCZeroExtension() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:277
const AMDGPURegisterBankInfo * getRegBankInfo() const override
Definition: GCNSubtarget.h:297
bool hasSDWAOutModsVOPC() const
Definition: GCNSubtarget.h:760
bool hasRestrictedSOffset() const
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:679
bool hasGFX940Insts() const
bool hasSDWASdst() const
Definition: GCNSubtarget.h:752
bool hasMovB64() const
bool isWave32() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:343
bool hasNegativeUnalignedScratchOffsetBug() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
Generation getGeneration() const
Definition: GCNSubtarget.h:316
bool hasVOP3Literal() const
Definition: GCNSubtarget.h:923
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:731
bool hasAddr64() const
Definition: GCNSubtarget.h:380
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:723
bool hasGDS() const
bool hasPartialNSAEncoding() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
A possibly irreducible generalization of a Loop.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:157
iterator find(SlotIndex Pos)
find - Return an iterator pointing to the first segment that ends after Pos, or end().
void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:537
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:607
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:617
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:193
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:237
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition: MCInstrDesc.h:444
bool mayLoad() const
Return true if this instruction could possibly read memory.
Definition: MCInstrDesc.h:438
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition: MCInstrDesc.h:248
unsigned short Opcode
Definition: MCInstrDesc.h:205
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
Definition: MCInstrDesc.h:565
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
uint8_t OperandType
Information about the type of the operand.
Definition: MCInstrDesc.h:97
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:91
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:393
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
void setVariableValue(const MCExpr *Value)
Definition: MCSymbol.cpp:47
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
unsigned pred_size() const
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
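computeRegisterLiveness is the usual way to check whether a status register such as SCC is free to clobber at an insertion point. A minimal sketch (helper name is illustrative; AMDGPU::SCC comes from the backend's generated register enum, assumed available as it is in this file):
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
using namespace llvm;
// True if SCC is known dead just before I, so an instruction that clobbers SCC
// can be inserted there without preserving it.
static bool isSCCDeadBefore(const MachineBasicBlock &MBB,
                            MachineBasicBlock::const_iterator I,
                            const TargetRegisterInfo *TRI) {
  return MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, I) ==
         MachineBasicBlock::LQR_Dead;
}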
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into two pieces at SplitInst.
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< iterator > terminators()
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineDomTreeNode * addNewBlock(MachineBasicBlock *BB, MachineBasicBlock *DomBB)
addNewBlock - Add a new node to the dominator tree information.
bool properlyDominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
void changeImmediateDominator(MachineBasicBlock *N, MachineBasicBlock *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
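The MachineInstrBuilder methods listed above are normally chained off BuildMI (documented further below). A minimal sketch of the pattern; TII, MBB, I, DL and the registers are assumed to be in scope, and AMDGPU::S_ADD_I32 is only an example opcode with a (dst, src0, src1) layout:
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
// Emit "DstReg = SrcReg + 16" before I using the chained builder API.
static MachineInstr *emitAddImm(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I,
                                const DebugLoc &DL, Register DstReg,
                                Register SrcReg) {
  return BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), DstReg)
      .addReg(SrcReg)
      .addImm(16)
      .getInstr();
}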
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:569
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:346
void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
unsigned getNumOperands() const
Returns the total number of operands.
Definition: MachineInstr.h:572
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
iterator_range< mop_iterator > explicit_operands()
Definition: MachineInstr.h:691
unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:815
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:800
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:782
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:498
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
iterator_range< mop_iterator > implicit_operands()
Definition: MachineInstr.h:699
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
uint32_t getFlags() const
Return the MI flags bitvector.
Definition: MachineInstr.h:391
int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
bool addRegisterDead(Register Reg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound=false)
We have determined MI defined a register without a use.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
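The RegScavenger entries above describe the post-RA way of finding a spare register, for example for a long-branch sequence. A minimal sketch of the backwards-scavenging pattern; the register class and flags are chosen for illustration, and AMDGPU::SReg_64RegClass is the backend's generated class object:
#include "llvm/CodeGen/RegisterScavenging.h"
using namespace llvm;
// Track liveness backwards from the end of MBB and try to find a free 64-bit
// SGPR pair at MI without allowing a spill; an invalid Register means failure.
static Register findFreeSGPRPair(RegScavenger &RS, MachineBasicBlock &MBB,
                                 MachineInstr &MI) {
  RS.enterBasicBlockEnd(MBB);
  return RS.scavengeRegisterBackwards(AMDGPU::SReg_64RegClass,
                                      MI.getIterator(),
                                      /*RestoreAfter=*/false, /*SPAdj=*/0,
                                      /*AllowSpill=*/false);
}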
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:792
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:554
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
bool isNonUniformBranchInstr(MachineInstr &Instr) const
static bool isVOP3(const MachineInstr &MI)
Definition: SIInstrInfo.h:504
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
Definition: SIInstrInfo.h:924
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
Definition: SIInstrInfo.h:1144
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:636
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
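getAddNoCarry returns a builder that already carries the destination (and, on subtargets without a carry-less VALU add, a dead carry-out def); the caller appends the remaining operands. A minimal sketch under that assumption; the operand order and trailing clamp bit follow the usual VOP3 layout and are illustrative:
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
// Emit DestReg = BaseReg + Imm with whichever add opcode the subtarget
// prefers, without caring about the carry-out.
static void emitAddNoCarry(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           Register DestReg, Register BaseReg, int64_t Imm) {
  TII->getAddNoCarry(MBB, I, DL, DestReg)
      .addImm(Imm)     // src0
      .addReg(BaseReg) // src1
      .addImm(0);      // clamp
}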
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:544
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const
Get required immediate operand.
Definition: SIInstrInfo.h:1272
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:536
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:649
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:408
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:496
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:512
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:604
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIdx operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:520
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
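splitFlatOffset is the counterpart of isLegalFLATOffset: the part of the offset that fits the instruction's immediate field is returned first, and the remainder has to be folded into the address by the caller. A minimal sketch, with TII and COffsetVal assumed in scope and the flat-global variant used as an example:
// Split a global-FLAT offset into an encodable immediate and a remainder.
std::pair<int64_t, int64_t> Split = TII->splitFlatOffset(
    COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
int64_t ImmField = Split.first;   // goes into the instruction's offset field
int64_t Remainder = Split.second; // must be added to the address register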
bool isSpill(uint16_t Opcode) const
Definition: SIInstrInfo.h:740
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:588
static bool isSOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:448
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:570
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
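getNamedOperand (and getNamedImmOperand above) look operands up by name instead of by position, which matters because the index of, say, soffset differs between encodings. A minimal sketch with an example operand name; MI and TII are assumed in scope:
// Read the soffset operand of a buffer instruction, if the opcode has one.
if (MachineOperand *SOff = TII->getNamedOperand(MI, AMDGPU::OpName::soffset)) {
  if (SOff->isImm() && SOff->getImm() == 0) {
    // soffset is the constant zero; callers often special-case this.
  }
}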
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:628
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:596
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
Definition: SIInstrInfo.h:424
static bool isSOPK(const MachineInstr &MI)
Definition: SIInstrInfo.h:464
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:957
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:528
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:618
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:760
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
void materializeImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, int64_t Value) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by the assembler.
static bool isVGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:716
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool isVGPRCopy(const MachineInstr &MI) const
Definition: SIInstrInfo.h:1003
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:580
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description.
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
void convertNonUniformIfRegion(MachineBasicBlock *IfEntry, MachineBasicBlock *IfEnd) const
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
static bool isAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:681
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
static bool sopkIsZext(unsigned Opcode)
Definition: SIInstrInfo.h:863
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
static bool isSGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:728
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:809
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:456
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:612
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:416
bool isBarrier(unsigned Opcode) const
Definition: SIInstrInfo.h:939
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
Definition: SIInstrInfo.h:1285
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
static bool isFixedSize(const MachineInstr &MI)
Definition: SIInstrInfo.h:880
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx operand of MI.
static bool isLDSDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:562
unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isVOP1(const MachineInstr &MI)
Definition: SIInstrInfo.h:488
SIInstrInfo(const GCNSubtarget &ST)
Definition: SIInstrInfo.cpp:63
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
static bool hasVectorRegisters(const TargetRegisterClass *RC)
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool opCanUseInlineConstant(unsigned OpType) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
bool opCanUseLiteralConstant(unsigned OpType) const
static bool hasVGPRs(const TargetRegisterClass *RC)
static bool isVGPRClass(const TargetRegisterClass *RC)
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
const TargetRegisterClass * getBoolRC() const
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getChannelFromSubReg(unsigned SubReg) const
MCRegister getVCC() const
static bool hasAGPRs(const TargetRegisterClass *RC)
const TargetRegisterClass * getWaveMaskRegClass() const
bool spillSGPRToVGPR() const
const TargetRegisterClass * getVGPR64Class() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:65
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:237
SlotIndexes pass.
Definition: SlotIndexes.h:297
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:531
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination lets the target make necessary checks and insert the copy to the PHI destinati...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
const TargetRegisterClass * getAllocatableClass(const TargetRegisterClass *RC) const
Return the maximal subclass of the given register class that is allocatable or NULL.
unsigned getSubRegIdxSize(unsigned Idx) const
Get the size of the bit range covered by a sub-register index.
unsigned getSubRegIdxOffset(unsigned Idx) const
Get the offset of the bit range covered by a sub-register index.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
Definition: SIInstrInfo.h:1563
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isDPALU_DPP(const MCInstrDesc &OpDesc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
Definition: SIInstrInfo.h:1564
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
bool isTrue16Inst(unsigned Opc)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo)
Is this an AMDGPU specific source operand? These include registers, inline constants,...
const uint64_t RSRC_TID_ENABLE
Definition: SIInstrInfo.h:1566
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isGenericAtomic(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isHi(unsigned Reg, const MCRegisterInfo &MRI)
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition: SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition: SIDefines.h:201
@ OPERAND_REG_IMM_V2FP16
Definition: SIDefines.h:211
@ OPERAND_REG_INLINE_C_V2INT32
Definition: SIDefines.h:227
@ OPERAND_REG_INLINE_C_FP64
Definition: SIDefines.h:223
@ OPERAND_REG_INLINE_C_BF16
Definition: SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2BF16
Definition: SIDefines.h:225
@ OPERAND_REG_IMM_V2INT16
Definition: SIDefines.h:212
@ OPERAND_REG_IMM_BF16
Definition: SIDefines.h:205
@ OPERAND_REG_INLINE_AC_V2FP16
Definition: SIDefines.h:246
@ OPERAND_REG_IMM_INT32
Operands with register or 32-bit immediate.
Definition: SIDefines.h:200
@ OPERAND_REG_IMM_V2BF16
Definition: SIDefines.h:210
@ OPERAND_REG_IMM_BF16_DEFERRED
Definition: SIDefines.h:207
@ OPERAND_REG_IMM_FP16
Definition: SIDefines.h:206
@ OPERAND_REG_INLINE_C_INT64
Definition: SIDefines.h:219
@ OPERAND_REG_INLINE_AC_BF16
Definition: SIDefines.h:240
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition: SIDefines.h:217
@ OPERAND_REG_INLINE_AC_INT16
Operands with an AccVGPR register or inline constant.
Definition: SIDefines.h:238
@ OPERAND_REG_IMM_FP64
Definition: SIDefines.h:204
@ OPERAND_REG_INLINE_C_V2FP16
Definition: SIDefines.h:226
@ OPERAND_REG_INLINE_AC_V2INT16
Definition: SIDefines.h:244
@ OPERAND_REG_INLINE_AC_FP16
Definition: SIDefines.h:241
@ OPERAND_REG_INLINE_AC_INT32
Definition: SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition: SIDefines.h:242
@ OPERAND_REG_INLINE_AC_V2BF16
Definition: SIDefines.h:245
@ OPERAND_REG_IMM_V2INT32
Definition: SIDefines.h:213
@ OPERAND_REG_IMM_FP32
Definition: SIDefines.h:203
@ OPERAND_INPUT_MODS
Definition: SIDefines.h:251
@ OPERAND_REG_INLINE_C_FP32
Definition: SIDefines.h:222
@ OPERAND_REG_INLINE_C_INT32
Definition: SIDefines.h:218
@ OPERAND_REG_INLINE_C_V2INT16
Definition: SIDefines.h:224
@ OPERAND_REG_IMM_V2FP32
Definition: SIDefines.h:214
@ OPERAND_REG_INLINE_AC_FP64
Definition: SIDefines.h:243
@ OPERAND_REG_INLINE_C_FP16
Definition: SIDefines.h:221
@ OPERAND_REG_IMM_INT16
Definition: SIDefines.h:202
@ OPERAND_REG_INLINE_C_V2FP32
Definition: SIDefines.h:228
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition: SIDefines.h:231
@ OPERAND_REG_IMM_FP32_DEFERRED
Definition: SIDefines.h:209
@ OPERAND_REG_IMM_FP16_DEFERRED
Definition: SIDefines.h:208
@ TI_SCRATCH_RSRC_DWORD1
Definition: AMDGPU.h:409
@ TI_SCRATCH_RSRC_DWORD3
Definition: AMDGPU.h:411
@ TI_SCRATCH_RSRC_DWORD0
Definition: AMDGPU.h:408
@ TI_SCRATCH_RSRC_DWORD2
Definition: AMDGPU.h:410
@ TI_CONSTDATA_START
Definition: AMDGPU.h:407
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
int getMCOpcode(uint16_t Opcode, unsigned Gen)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
Definition: SIInstrInfo.h:1565
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
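The isInlinableLiteral* helpers decide whether an immediate can be encoded as an inline constant instead of occupying a literal slot. A minimal sketch, assuming ST is the GCNSubtarget and Imm32/Imm64 are the immediates being checked:
// True if the value encodes as an inline constant for 32-/64-bit operands.
bool Inline32 = AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Imm32),
                                             ST.hasInv2PiInlineImm());
bool Inline64 = AMDGPU::isInlinableLiteral64(Imm64, ST.hasInv2PiInlineImm());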
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
bool isGraphics(CallingConv::ID cc)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ OPERAND_GENERIC_4
Definition: MCInstrDesc.h:70
@ OPERAND_GENERIC_2
Definition: MCInstrDesc.h:68
@ OPERAND_GENERIC_1
Definition: MCInstrDesc.h:67
@ OPERAND_REGISTER
Definition: MCInstrDesc.h:61
@ OPERAND_GENERIC_3
Definition: MCInstrDesc.h:69
@ OPERAND_IMMEDIATE
Definition: MCInstrDesc.h:60
@ OPERAND_UNKNOWN
Definition: MCInstrDesc.h:59
@ OPERAND_GENERIC_0
Definition: MCInstrDesc.h:66
@ OPERAND_GENERIC_5
Definition: MCInstrDesc.h:71
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:138
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition: SIInstrInfo.h:1454
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:547
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:296
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
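Hi_32 and Lo_32 are the standard way to split a 64-bit value into the two 32-bit halves that, for example, a pair of 32-bit moves would materialize. A small self-contained example (helper name is illustrative):
#include "llvm/Support/MathExtras.h"
#include <cstdint>
// Split a 64-bit immediate into its low and high 32-bit halves.
static void splitImm64(uint64_t Imm, uint32_t &LoHalf, uint32_t &HiHalf) {
  LoHalf = llvm::Lo_32(Imm);
  HiHalf = llvm::Hi_32(Imm);
}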
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:260
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
Definition: TargetOpcodes.h:36
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Error
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition: Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition: MathExtras.h:219
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
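Both exec helpers guard transformations that move a VALU value across code that might rewrite EXEC: if the active-lane mask can change between the def and the use, the fold would observe a different set of lanes and is unsafe. A minimal sketch of the guard, with MRI, Reg, DefMI and UseMI assumed in scope:
// Abort a def->use fold if EXEC may be rewritten in between.
if (execMayBeModifiedBeforeUse(MRI, Reg, DefMI, UseMI))
  return false; // a different set of lanes could be active at the use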
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition: APFloat.cpp:235
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Description of the encoding of one expression Op.
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
Definition: LiveVariables.h:83
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition: SIInstrInfo.h:49
MachineInstr * top() const
Definition: SIInstrInfo.h:54
bool empty() const
Definition: SIInstrInfo.h:64
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition: SIInstrInfo.h:73
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.