SIInstrInfo.cpp
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
31#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include "llvm/MC/MCContext.h"
#include "llvm/Support/CommandLine.h"
35
36using namespace llvm;
37
38#define DEBUG_TYPE "si-instr-info"
39
40#define GET_INSTRINFO_CTOR_DTOR
41#include "AMDGPUGenInstrInfo.inc"
42
43namespace llvm::AMDGPU {
44#define GET_D16ImageDimIntrinsics_IMPL
45#define GET_ImageDimIntrinsicTable_IMPL
46#define GET_RsrcIntrinsics_IMPL
47#include "AMDGPUGenSearchableTables.inc"
48} // namespace llvm::AMDGPU
49
50// Must be at least 4 to be able to branch over minimum unconditional branch
51// code. This is only for making it possible to write reasonably small tests for
52// long branches.
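// (Lit tests typically pass a small value such as -amdgpu-s-branch-bits=5 to
// llc to exercise the long-branch expansion without needing huge inputs.)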
53static cl::opt<unsigned>
54    BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
55 cl::desc("Restrict range of branch instructions (DEBUG)"));
56
57static cl::opt<bool> Fix16BitCopies(
58    "amdgpu-fix-16-bit-physreg-copies",
59    cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
60    cl::init(true),
61    cl::ReallyHidden);
62
63SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
64    : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
65 RI(ST), ST(ST) {
66 SchedModel.init(&ST);
67}
68
69//===----------------------------------------------------------------------===//
70// TargetInstrInfo callbacks
71//===----------------------------------------------------------------------===//
72
73static unsigned getNumOperandsNoGlue(SDNode *Node) {
74 unsigned N = Node->getNumOperands();
75 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
76 --N;
77 return N;
78}
79
80/// Returns true if both nodes have the same value for the given
81/// operand \p OpName, or if both nodes do not have this operand.
82static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
83 unsigned Opc0 = N0->getMachineOpcode();
84 unsigned Opc1 = N1->getMachineOpcode();
85
86 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
87 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
88
89 if (Op0Idx == -1 && Op1Idx == -1)
90 return true;
91
92
93 if ((Op0Idx == -1 && Op1Idx != -1) ||
94 (Op1Idx == -1 && Op0Idx != -1))
95 return false;
96
97 // getNamedOperandIdx returns the index for the MachineInstr's operands,
98 // which includes the result as the first operand. We are indexing into the
99 // MachineSDNode's operands, so we need to skip the result operand to get
100 // the real index.
101 --Op0Idx;
102 --Op1Idx;
103
104 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
105}
106
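// Conservative check used by isReallyTriviallyReMaterializable below: plain
// VALU/SALU instructions qualify, as do SMRD loads whose memory operands are
// all invariant loads.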
107static bool canRemat(const MachineInstr &MI) {
108
109  if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
110      SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
111      SIInstrInfo::isSALU(MI))
112    return true;
113
114 if (SIInstrInfo::isSMRD(MI)) {
115 return !MI.memoperands_empty() &&
116 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
117 return MMO->isLoad() && MMO->isInvariant();
118 });
119 }
120
121 return false;
122}
123
124bool SIInstrInfo::isReallyTriviallyReMaterializable(
125    const MachineInstr &MI) const {
126
127 if (canRemat(MI)) {
128 // Normally VALU use of exec would block the rematerialization, but that
129 // is OK in this case to have an implicit exec read as all VALU do.
130 // We really want all of the generic logic for this except for this.
131
132 // Another potential implicit use is mode register. The core logic of
133 // the RA will not attempt rematerialization if mode is set anywhere
134 // in the function, otherwise it is safe since mode is not changed.
135
136 // There is difference to generic method which does not allow
137 // rematerialization if there are virtual register uses. We allow this,
138 // therefore this method includes SOP instructions as well.
139 if (!MI.hasImplicitDef() &&
140 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
141 !MI.mayRaiseFPException())
142 return true;
143 }
144
145  return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
146}
147
148// Returns true if the scalar result of a VALU instruction depends on exec.
149static bool resultDependsOnExec(const MachineInstr &MI) {
150  // Ignore comparisons which are only used masked with exec.
151 // This allows some hoisting/sinking of VALU comparisons.
152 if (MI.isCompare()) {
153 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
154 Register DstReg = MI.getOperand(0).getReg();
155 if (!DstReg.isVirtual())
156 return true;
157 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
158 switch (Use.getOpcode()) {
159 case AMDGPU::S_AND_SAVEEXEC_B32:
160 case AMDGPU::S_AND_SAVEEXEC_B64:
161 break;
162 case AMDGPU::S_AND_B32:
163 case AMDGPU::S_AND_B64:
164 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
165 return true;
166 break;
167 default:
168 return true;
169 }
170 }
171 return false;
172 }
173
174 switch (MI.getOpcode()) {
175 default:
176 break;
177 case AMDGPU::V_READFIRSTLANE_B32:
178 return true;
179 }
180
181 return false;
182}
183
184bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
185  // Any implicit use of exec by VALU is not a real register read.
186  return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
187         isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
188}
189
190bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
191                               MachineBasicBlock *SuccToSinkTo,
192 MachineCycleInfo *CI) const {
193 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
194 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
195 return true;
196
197 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
198 // Check if sinking of MI would create temporal divergent use.
199 for (auto Op : MI.uses()) {
200 if (Op.isReg() && Op.getReg().isVirtual() &&
201 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
202 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
203
204 // SgprDef defined inside cycle
205 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
206 if (FromCycle == nullptr)
207 continue;
208
209 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
210 // Check if there is a FromCycle that contains SgprDef's basic block but
211 // does not contain SuccToSinkTo and also has divergent exit condition.
212 while (FromCycle && !FromCycle->contains(ToCycle)) {
213        SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
214        FromCycle->getExitingBlocks(ExitingBlocks);
215
216 // FromCycle has divergent exit condition.
217 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
218 if (hasDivergentBranch(ExitingBlock))
219 return false;
220 }
221
222 FromCycle = FromCycle->getParentCycle();
223 }
224 }
225 }
226
227 return true;
228}
229
230bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
231                                          int64_t &Offset0,
232 int64_t &Offset1) const {
233 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
234 return false;
235
236 unsigned Opc0 = Load0->getMachineOpcode();
237 unsigned Opc1 = Load1->getMachineOpcode();
238
239 // Make sure both are actually loads.
240 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
241 return false;
242
243 // A mayLoad instruction without a def is not a load. Likely a prefetch.
244 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
245 return false;
246
247 if (isDS(Opc0) && isDS(Opc1)) {
248
249 // FIXME: Handle this case:
250 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
251 return false;
252
253 // Check base reg.
254 if (Load0->getOperand(0) != Load1->getOperand(0))
255 return false;
256
257 // Skip read2 / write2 variants for simplicity.
258    // TODO: We should report true if the used offsets are adjacent (excluding
259    // st64 versions).
260 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
261 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
262 if (Offset0Idx == -1 || Offset1Idx == -1)
263 return false;
264
265 // XXX - be careful of dataless loads
266 // getNamedOperandIdx returns the index for MachineInstrs. Since they
267 // include the output in the operand list, but SDNodes don't, we need to
268 // subtract the index by one.
269 Offset0Idx -= get(Opc0).NumDefs;
270 Offset1Idx -= get(Opc1).NumDefs;
271 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
272 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
273 return true;
274 }
275
276 if (isSMRD(Opc0) && isSMRD(Opc1)) {
277 // Skip time and cache invalidation instructions.
278 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
279 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
280 return false;
281
282 unsigned NumOps = getNumOperandsNoGlue(Load0);
283 if (NumOps != getNumOperandsNoGlue(Load1))
284 return false;
285
286 // Check base reg.
287 if (Load0->getOperand(0) != Load1->getOperand(0))
288 return false;
289
290 // Match register offsets, if both register and immediate offsets present.
291 assert(NumOps == 4 || NumOps == 5);
292 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
293 return false;
294
295 const ConstantSDNode *Load0Offset =
296 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
297 const ConstantSDNode *Load1Offset =
298 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
299
300 if (!Load0Offset || !Load1Offset)
301 return false;
302
303 Offset0 = Load0Offset->getZExtValue();
304 Offset1 = Load1Offset->getZExtValue();
305 return true;
306 }
307
308 // MUBUF and MTBUF can access the same addresses.
309 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
310
311 // MUBUF and MTBUF have vaddr at different indices.
312 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
313 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
314 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
315 return false;
316
317 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
318 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
319
320 if (OffIdx0 == -1 || OffIdx1 == -1)
321 return false;
322
323 // getNamedOperandIdx returns the index for MachineInstrs. Since they
324 // include the output in the operand list, but SDNodes don't, we need to
325 // subtract the index by one.
326 OffIdx0 -= get(Opc0).NumDefs;
327 OffIdx1 -= get(Opc1).NumDefs;
328
329 SDValue Off0 = Load0->getOperand(OffIdx0);
330 SDValue Off1 = Load1->getOperand(OffIdx1);
331
332 // The offset might be a FrameIndexSDNode.
333 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
334 return false;
335
336 Offset0 = Off0->getAsZExtVal();
337 Offset1 = Off1->getAsZExtVal();
338 return true;
339 }
340
341 return false;
342}
343
344static bool isStride64(unsigned Opc) {
345 switch (Opc) {
346 case AMDGPU::DS_READ2ST64_B32:
347 case AMDGPU::DS_READ2ST64_B64:
348 case AMDGPU::DS_WRITE2ST64_B32:
349 case AMDGPU::DS_WRITE2ST64_B64:
350 return true;
351 default:
352 return false;
353 }
354}
355
356bool SIInstrInfo::getMemOperandsWithOffsetWidth(
357    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
358    int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
359 const TargetRegisterInfo *TRI) const {
360 if (!LdSt.mayLoadOrStore())
361 return false;
362
363 unsigned Opc = LdSt.getOpcode();
364 OffsetIsScalable = false;
365 const MachineOperand *BaseOp, *OffsetOp;
366 int DataOpIdx;
367
368 if (isDS(LdSt)) {
369 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
370 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
371 if (OffsetOp) {
372 // Normal, single offset LDS instruction.
373 if (!BaseOp) {
374 // DS_CONSUME/DS_APPEND use M0 for the base address.
375 // TODO: find the implicit use operand for M0 and use that as BaseOp?
376 return false;
377 }
378 BaseOps.push_back(BaseOp);
379 Offset = OffsetOp->getImm();
380 // Get appropriate operand, and compute width accordingly.
381 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
382 if (DataOpIdx == -1)
383 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
384 Width = getOpSize(LdSt, DataOpIdx);
385 } else {
386 // The 2 offset instructions use offset0 and offset1 instead. We can treat
387 // these as a load with a single offset if the 2 offsets are consecutive.
388 // We will use this for some partially aligned loads.
389 const MachineOperand *Offset0Op =
390 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
391 const MachineOperand *Offset1Op =
392 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
393
394 unsigned Offset0 = Offset0Op->getImm() & 0xff;
395 unsigned Offset1 = Offset1Op->getImm() & 0xff;
396 if (Offset0 + 1 != Offset1)
397 return false;
398
399 // Each of these offsets is in element sized units, so we need to convert
400 // to bytes of the individual reads.
401
402 unsigned EltSize;
403 if (LdSt.mayLoad())
404 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
405 else {
406 assert(LdSt.mayStore());
407 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
408 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
409 }
410
411 if (isStride64(Opc))
412 EltSize *= 64;
413
414 BaseOps.push_back(BaseOp);
415 Offset = EltSize * Offset0;
416 // Get appropriate operand(s), and compute width accordingly.
417 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
418 if (DataOpIdx == -1) {
419 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 Width = getOpSize(LdSt, DataOpIdx);
421 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
422 Width = Width.getValue() + getOpSize(LdSt, DataOpIdx);
423 } else {
424 Width = getOpSize(LdSt, DataOpIdx);
425 }
426 }
427 return true;
428 }
429
430 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
431 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
432 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
433 return false;
434 BaseOps.push_back(RSrc);
435 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
436 if (BaseOp && !BaseOp->isFI())
437 BaseOps.push_back(BaseOp);
438 const MachineOperand *OffsetImm =
439 getNamedOperand(LdSt, AMDGPU::OpName::offset);
440 Offset = OffsetImm->getImm();
441 const MachineOperand *SOffset =
442 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
443 if (SOffset) {
444 if (SOffset->isReg())
445 BaseOps.push_back(SOffset);
446 else
447 Offset += SOffset->getImm();
448 }
449 // Get appropriate operand, and compute width accordingly.
450 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
451 if (DataOpIdx == -1)
452 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
453 if (DataOpIdx == -1) // LDS DMA
454 return false;
455 Width = getOpSize(LdSt, DataOpIdx);
456 return true;
457 }
458
459 if (isImage(LdSt)) {
460 auto RsrcOpName =
461 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
462 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
463 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
464 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
465 if (VAddr0Idx >= 0) {
466 // GFX10 possible NSA encoding.
467 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
468 BaseOps.push_back(&LdSt.getOperand(I));
469 } else {
470 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
471 }
472 Offset = 0;
473 // Get appropriate operand, and compute width accordingly.
474 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
475 if (DataOpIdx == -1)
476 return false; // no return sampler
477 Width = getOpSize(LdSt, DataOpIdx);
478 return true;
479 }
480
481 if (isSMRD(LdSt)) {
482 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
483 if (!BaseOp) // e.g. S_MEMTIME
484 return false;
485 BaseOps.push_back(BaseOp);
486 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
487 Offset = OffsetOp ? OffsetOp->getImm() : 0;
488 // Get appropriate operand, and compute width accordingly.
489 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
490 if (DataOpIdx == -1)
491 return false;
492 Width = getOpSize(LdSt, DataOpIdx);
493 return true;
494 }
495
496 if (isFLAT(LdSt)) {
497 // Instructions have either vaddr or saddr or both or none.
498 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
499 if (BaseOp)
500 BaseOps.push_back(BaseOp);
501 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
502 if (BaseOp)
503 BaseOps.push_back(BaseOp);
504 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
505 // Get appropriate operand, and compute width accordingly.
506 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
507 if (DataOpIdx == -1)
508 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
509 if (DataOpIdx == -1) // LDS DMA
510 return false;
511 Width = getOpSize(LdSt, DataOpIdx);
512 return true;
513 }
514
515 return false;
516}
517
518static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
519                                  ArrayRef<const MachineOperand *> BaseOps1,
520                                  const MachineInstr &MI2,
521                                  ArrayRef<const MachineOperand *> BaseOps2) {
522  // Only examine the first "base" operand of each instruction, on the
523 // assumption that it represents the real base address of the memory access.
524 // Other operands are typically offsets or indices from this base address.
525 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
526 return true;
527
528 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
529 return false;
530
531 auto *MO1 = *MI1.memoperands_begin();
532 auto *MO2 = *MI2.memoperands_begin();
533 if (MO1->getAddrSpace() != MO2->getAddrSpace())
534 return false;
535
536 const auto *Base1 = MO1->getValue();
537 const auto *Base2 = MO2->getValue();
538 if (!Base1 || !Base2)
539 return false;
540 Base1 = getUnderlyingObject(Base1);
541 Base2 = getUnderlyingObject(Base2);
542
543 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
544 return false;
545
546 return Base1 == Base2;
547}
548
549bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
550                                      int64_t Offset1, bool OffsetIsScalable1,
551                                      ArrayRef<const MachineOperand *> BaseOps2,
552                                      int64_t Offset2, bool OffsetIsScalable2,
553 unsigned ClusterSize,
554 unsigned NumBytes) const {
555 // If the mem ops (to be clustered) do not have the same base ptr, then they
556 // should not be clustered
557 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
558 if (!BaseOps1.empty() && !BaseOps2.empty()) {
559 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
560 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
561 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
562 return false;
563
564 const SIMachineFunctionInfo *MFI =
565 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
566 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
567 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
568 // If only one base op is empty, they do not have the same base ptr
569 return false;
570 }
571
572 // In order to avoid register pressure, on an average, the number of DWORDS
573 // loaded together by all clustered mem ops should not exceed
574 // MaxMemoryClusterDWords. This is an empirical value based on certain
575 // observations and performance related experiments.
576 // The good thing about this heuristic is - it avoids clustering of too many
577 // sub-word loads, and also avoids clustering of wide loads. Below is the
578 // brief summary of how the heuristic behaves for various `LoadSize` when
579 // MaxMemoryClusterDWords is 8.
580 //
581 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
582 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
583 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
584 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
585 // (5) LoadSize >= 17: do not cluster
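 // Worked example with the default limit of 8 DWORDs: ClusterSize = 4 and
 // NumBytes = 32 give LoadSize = 8 and NumDWords = 2 * 4 = 8, which still
 // clusters; at NumBytes = 48 the estimate grows to 12 DWORDs and clustering
 // is rejected.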
586 const unsigned LoadSize = NumBytes / ClusterSize;
587 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
588 return NumDWords <= MaxMemoryClusterDWords;
589}
590
591// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
592// the first 16 loads will be interleaved with the stores, and the next 16 will
593// be clustered as expected. It should really be split into two batches of 16 stores.
594//
595// Loads are clustered until this returns false, rather than trying to schedule
596// groups of stores. This also means we have to deal with saying different
597// address space loads should be clustered, and ones which might cause bank
598// conflicts.
599//
600// This might be deprecated so it might not be worth that much effort to fix.
601bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
602                                          int64_t Offset0, int64_t Offset1,
603 unsigned NumLoads) const {
604 assert(Offset1 > Offset0 &&
605 "Second offset should be larger than first offset!");
606 // If we have less than 16 loads in a row, and the offsets are within 64
607 // bytes, then schedule together.
608
609 // A cacheline is 64 bytes (for global memory).
610 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
611}
612
613static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
614                              MachineBasicBlock::iterator MI,
615                              const DebugLoc &DL, MCRegister DestReg,
616 MCRegister SrcReg, bool KillSrc,
617 const char *Msg = "illegal VGPR to SGPR copy") {
618  MachineFunction *MF = MBB.getParent();
619  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
620  LLVMContext &C = MF->getFunction().getContext();
621  C.diagnose(IllegalCopy);
622
623 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
624 .addReg(SrcReg, getKillRegState(KillSrc));
625}
626
627/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
628/// possible to have a direct copy in these cases on GFX908, so an intermediate
629/// VGPR copy is required.
630static void indirectCopyToAGPR(const SIInstrInfo &TII,
631                               MachineBasicBlock &MBB,
632                               MachineBasicBlock::iterator MI,
633                               const DebugLoc &DL, MCRegister DestReg,
634 MCRegister SrcReg, bool KillSrc,
635 RegScavenger &RS, bool RegsOverlap,
636 Register ImpDefSuperReg = Register(),
637 Register ImpUseSuperReg = Register()) {
638 assert((TII.getSubtarget().hasMAIInsts() &&
639 !TII.getSubtarget().hasGFX90AInsts()) &&
640 "Expected GFX908 subtarget.");
641
642 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
643 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
644 "Source register of the copy should be either an SGPR or an AGPR.");
645
646 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
647 "Destination register of the copy should be an AGPR.");
648
649 const SIRegisterInfo &RI = TII.getRegisterInfo();
650
651 // First try to find defining accvgpr_write to avoid temporary registers.
652 // In the case of copies of overlapping AGPRs, we conservatively do not
653 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
654 // an accvgpr_write used for this same copy due to implicit-defs
655 if (!RegsOverlap) {
656 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
657 --Def;
658
659 if (!Def->modifiesRegister(SrcReg, &RI))
660 continue;
661
662 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
663 Def->getOperand(0).getReg() != SrcReg)
664 break;
665
666 MachineOperand &DefOp = Def->getOperand(1);
667 assert(DefOp.isReg() || DefOp.isImm());
668
669 if (DefOp.isReg()) {
670 bool SafeToPropagate = true;
671 // Check that register source operand is not clobbered before MI.
672 // Immediate operands are always safe to propagate.
673 for (auto I = Def; I != MI && SafeToPropagate; ++I)
674 if (I->modifiesRegister(DefOp.getReg(), &RI))
675 SafeToPropagate = false;
676
677 if (!SafeToPropagate)
678 break;
679
680 DefOp.setIsKill(false);
681 }
682
683 MachineInstrBuilder Builder =
684 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
685 .add(DefOp);
686 if (ImpDefSuperReg)
687 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
688
689 if (ImpUseSuperReg) {
690        Builder.addReg(ImpUseSuperReg,
691                       getKillRegState(KillSrc) | RegState::Implicit);
692      }
693
694 return;
695 }
696 }
697
698  RS.enterBasicBlockEnd(MBB);
699  RS.backward(std::next(MI));
700
701 // Ideally we want to have three registers for a long reg_sequence copy
702 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
703 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
704 *MBB.getParent());
705
706 // Registers in the sequence are allocated contiguously so we can just
707 // use register number to pick one of three round-robin temps.
708 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
709 Register Tmp =
710 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
711  assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
712         "VGPR used for an intermediate copy should have been reserved.");
713
714 // Only loop through if there are any free registers left. We don't want to
715 // spill.
716 while (RegNo--) {
717 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
718 /* RestoreAfter */ false, 0,
719 /* AllowSpill */ false);
720 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
721 break;
722 Tmp = Tmp2;
723 RS.setRegUsed(Tmp);
724 }
725
726 // Insert copy to temporary VGPR.
727 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
728 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
729 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
730 } else {
731 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
732 }
733
734 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
735 .addReg(SrcReg, getKillRegState(KillSrc));
736 if (ImpUseSuperReg) {
737    UseBuilder.addReg(ImpUseSuperReg,
738                      getKillRegState(KillSrc) | RegState::Implicit);
739  }
740
741 MachineInstrBuilder DefBuilder
742 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
743 .addReg(Tmp, RegState::Kill);
744
745 if (ImpDefSuperReg)
746 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
747}
748
749static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
750                           MachineBasicBlock::iterator MI, const DebugLoc &DL,
751                           MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
752 const TargetRegisterClass *RC, bool Forward) {
753 const SIRegisterInfo &RI = TII.getRegisterInfo();
754 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
755  MachineBasicBlock::iterator I = MI;
756  MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
757
758 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
759 int16_t SubIdx = BaseIndices[Idx];
760 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
761 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
762 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
763 unsigned Opcode = AMDGPU::S_MOV_B32;
764
765 // Is SGPR aligned? If so try to combine with next.
766 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
767 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
768 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
769 // Can use SGPR64 copy
770 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
771 SubIdx = RI.getSubRegFromChannel(Channel, 2);
772 DestSubReg = RI.getSubReg(DestReg, SubIdx);
773 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
774 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
775 Opcode = AMDGPU::S_MOV_B64;
776 Idx++;
777 }
778
779 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
780 .addReg(SrcSubReg)
781 .addReg(SrcReg, RegState::Implicit);
782
783 if (!FirstMI)
784 FirstMI = LastMI;
785
786 if (!Forward)
787 I--;
788 }
789
790 assert(FirstMI && LastMI);
791 if (!Forward)
792 std::swap(FirstMI, LastMI);
793
794 FirstMI->addOperand(
795 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
796
797 if (KillSrc)
798 LastMI->addRegisterKilled(SrcReg, &RI);
799}
800
801void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
802                              MachineBasicBlock::iterator MI,
803                              const DebugLoc &DL, MCRegister DestReg,
804 MCRegister SrcReg, bool KillSrc,
805 bool RenamableDest, bool RenamableSrc) const {
806 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
807 unsigned Size = RI.getRegSizeInBits(*RC);
808 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
809 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
810
811 // The rest of copyPhysReg assumes Src and Dst size are the same size.
812 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
813 // we remove Fix16BitCopies and this code block?
814 if (Fix16BitCopies) {
815 if (((Size == 16) != (SrcSize == 16))) {
816 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
818 MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
819 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
820 RegToFix = SubReg;
821
822 if (DestReg == SrcReg) {
823 // Identity copy. Insert empty bundle since ExpandPostRA expects an
824 // instruction here.
825 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
826 return;
827 }
828 RC = RI.getPhysRegBaseClass(DestReg);
829 Size = RI.getRegSizeInBits(*RC);
830 SrcRC = RI.getPhysRegBaseClass(SrcReg);
831 SrcSize = RI.getRegSizeInBits(*SrcRC);
832 }
833 }
834
835 if (RC == &AMDGPU::VGPR_32RegClass) {
836 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
837 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
838 AMDGPU::AGPR_32RegClass.contains(SrcReg));
839 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
840 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
841 BuildMI(MBB, MI, DL, get(Opc), DestReg)
842 .addReg(SrcReg, getKillRegState(KillSrc));
843 return;
844 }
845
846 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
847 RC == &AMDGPU::SReg_32RegClass) {
848 if (SrcReg == AMDGPU::SCC) {
849 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
850 .addImm(1)
851 .addImm(0);
852 return;
853 }
854
855 if (DestReg == AMDGPU::VCC_LO) {
856 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
857 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
858 .addReg(SrcReg, getKillRegState(KillSrc));
859 } else {
860 // FIXME: Hack until VReg_1 removed.
861 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
862 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
863 .addImm(0)
864 .addReg(SrcReg, getKillRegState(KillSrc));
865 }
866
867 return;
868 }
869
870 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
871 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
872 return;
873 }
874
875 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
876 .addReg(SrcReg, getKillRegState(KillSrc));
877 return;
878 }
879
880 if (RC == &AMDGPU::SReg_64RegClass) {
881 if (SrcReg == AMDGPU::SCC) {
882 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
883 .addImm(1)
884 .addImm(0);
885 return;
886 }
887
888 if (DestReg == AMDGPU::VCC) {
889 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
890 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
891 .addReg(SrcReg, getKillRegState(KillSrc));
892 } else {
893 // FIXME: Hack until VReg_1 removed.
894 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
895 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
896 .addImm(0)
897 .addReg(SrcReg, getKillRegState(KillSrc));
898 }
899
900 return;
901 }
902
903 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
904 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
905 return;
906 }
907
908 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
909 .addReg(SrcReg, getKillRegState(KillSrc));
910 return;
911 }
912
913 if (DestReg == AMDGPU::SCC) {
914 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
915 // but SelectionDAG emits such copies for i1 sources.
916 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
917 // This copy can only be produced by patterns
918 // with explicit SCC, which are known to be enabled
919 // only for subtargets with S_CMP_LG_U64 present.
920      assert(ST.hasScalarCompareEq64());
921      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
922 .addReg(SrcReg, getKillRegState(KillSrc))
923 .addImm(0);
924 } else {
925 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
926 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
927 .addReg(SrcReg, getKillRegState(KillSrc))
928 .addImm(0);
929 }
930
931 return;
932 }
933
934 if (RC == &AMDGPU::AGPR_32RegClass) {
935 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
936 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
937 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
938 .addReg(SrcReg, getKillRegState(KillSrc));
939 return;
940 }
941
942 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
943 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
944 .addReg(SrcReg, getKillRegState(KillSrc));
945 return;
946 }
947
948 // FIXME: Pass should maintain scavenger to avoid scan through the block on
949 // every AGPR spill.
950 RegScavenger RS;
951 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
952 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
953 return;
954 }
955
956 if (Size == 16) {
957 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
958 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
959 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
960
961 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
962 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
963 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
964 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
965 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
966 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
967 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
968 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
969
970 if (IsSGPRDst) {
971 if (!IsSGPRSrc) {
972 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
973 return;
974 }
975
976 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
977 .addReg(NewSrcReg, getKillRegState(KillSrc));
978 return;
979 }
980
981 if (IsAGPRDst || IsAGPRSrc) {
982 if (!DstLow || !SrcLow) {
983 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
984 "Cannot use hi16 subreg with an AGPR!");
985 }
986
987 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
988 return;
989 }
990
991 if (ST.hasTrue16BitInsts()) {
992 if (IsSGPRSrc) {
993 assert(SrcLow);
994 SrcReg = NewSrcReg;
995 }
996 // Use the smaller instruction encoding if possible.
997 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
998 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
999 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1000 .addReg(SrcReg);
1001 } else {
1002 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1003 .addImm(0) // src0_modifiers
1004 .addReg(SrcReg)
1005 .addImm(0); // op_sel
1006 }
1007 return;
1008 }
1009
1010 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1011 if (!DstLow || !SrcLow) {
1012 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1013 "Cannot use hi16 subreg on VI!");
1014 }
1015
1016 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1017 .addReg(NewSrcReg, getKillRegState(KillSrc));
1018 return;
1019 }
1020
1021 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1022 .addImm(0) // src0_modifiers
1023 .addReg(NewSrcReg)
1024                 .addImm(0) // clamp
1025                 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1026                                : AMDGPU::SDWA::SdwaSel::WORD_1)
1027                 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1028                 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1029                                : AMDGPU::SDWA::SdwaSel::WORD_1)
1030                 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1031    // First implicit operand is $exec.
1032 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1033 return;
1034 }
1035
1036 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1037 if (ST.hasMovB64()) {
1038 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1039 .addReg(SrcReg, getKillRegState(KillSrc));
1040 return;
1041 }
1042 if (ST.hasPkMovB32()) {
1043      BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1044        .addImm(SISrcMods::OP_SEL_1)
1045        .addReg(SrcReg)
1046        .addImm(SISrcMods::OP_SEL_1)
1047        .addReg(SrcReg)
1048 .addImm(0) // op_sel_lo
1049 .addImm(0) // op_sel_hi
1050 .addImm(0) // neg_lo
1051 .addImm(0) // neg_hi
1052 .addImm(0) // clamp
1053 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1054 return;
1055 }
1056 }
1057
1058 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1059 if (RI.isSGPRClass(RC)) {
1060 if (!RI.isSGPRClass(SrcRC)) {
1061 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1062 return;
1063 }
1064 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1065 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1066 Forward);
1067 return;
1068 }
1069
1070 unsigned EltSize = 4;
1071 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1072 if (RI.isAGPRClass(RC)) {
1073 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1074 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1075 else if (RI.hasVGPRs(SrcRC) ||
1076 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1077 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1078 else
1079 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1080 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1081 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1082 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1083 (RI.isProperlyAlignedRC(*RC) &&
1084 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1085 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1086 if (ST.hasMovB64()) {
1087 Opcode = AMDGPU::V_MOV_B64_e32;
1088 EltSize = 8;
1089 } else if (ST.hasPkMovB32()) {
1090 Opcode = AMDGPU::V_PK_MOV_B32;
1091 EltSize = 8;
1092 }
1093 }
1094
1095 // For the cases where we need an intermediate instruction/temporary register
1096 // (destination is an AGPR), we need a scavenger.
1097 //
1098 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1099 // whole block for every handled copy.
1100 std::unique_ptr<RegScavenger> RS;
1101 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1102 RS = std::make_unique<RegScavenger>();
1103
1104 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1105
1106 // If there is an overlap, we can't kill the super-register on the last
1107 // instruction, since it will also kill the components made live by this def.
1108 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1109 const bool CanKillSuperReg = KillSrc && !Overlap;
1110
1111 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1112 unsigned SubIdx;
1113 if (Forward)
1114 SubIdx = SubIndices[Idx];
1115 else
1116 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1117 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1118 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1119 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1120
1121 bool IsFirstSubreg = Idx == 0;
1122 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1123
1124 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1125 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1126 Register ImpUseSuper = SrcReg;
1127 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1128 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1129 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1130      MachineInstrBuilder MIB =
1131          BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1132              .addImm(SISrcMods::OP_SEL_1)
1133              .addReg(SrcSubReg)
1134              .addImm(SISrcMods::OP_SEL_1)
1135              .addReg(SrcSubReg)
1136 .addImm(0) // op_sel_lo
1137 .addImm(0) // op_sel_hi
1138 .addImm(0) // neg_lo
1139 .addImm(0) // neg_hi
1140 .addImm(0) // clamp
1141 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1142      if (IsFirstSubreg)
1143        MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1144    } else {
1145 MachineInstrBuilder Builder =
1146 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1147 if (IsFirstSubreg)
1148 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1149
1150 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1151 }
1152 }
1153}
1154
1155int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1156 int NewOpc;
1157
1158 // Try to map original to commuted opcode
1159 NewOpc = AMDGPU::getCommuteRev(Opcode);
1160 if (NewOpc != -1)
1161 // Check if the commuted (REV) opcode exists on the target.
1162 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1163
1164 // Try to map commuted to original opcode
1165 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1166 if (NewOpc != -1)
1167 // Check if the original (non-REV) opcode exists on the target.
1168 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1169
1170 return Opcode;
1171}
1172
1173void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
1174                                       MachineBasicBlock::iterator MI,
1175                                       const DebugLoc &DL, Register DestReg,
1176 int64_t Value) const {
1177  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1178  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1179 if (RegClass == &AMDGPU::SReg_32RegClass ||
1180 RegClass == &AMDGPU::SGPR_32RegClass ||
1181 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1182 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1183 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1184 .addImm(Value);
1185 return;
1186 }
1187
1188 if (RegClass == &AMDGPU::SReg_64RegClass ||
1189 RegClass == &AMDGPU::SGPR_64RegClass ||
1190 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1191 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1192 .addImm(Value);
1193 return;
1194 }
1195
1196 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1197 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1198 .addImm(Value);
1199 return;
1200 }
1201 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1202 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1203 .addImm(Value);
1204 return;
1205 }
1206
1207 unsigned EltSize = 4;
1208 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1209 if (RI.isSGPRClass(RegClass)) {
1210 if (RI.getRegSizeInBits(*RegClass) > 32) {
1211 Opcode = AMDGPU::S_MOV_B64;
1212 EltSize = 8;
1213 } else {
1214 Opcode = AMDGPU::S_MOV_B32;
1215 EltSize = 4;
1216 }
1217 }
1218
1219 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1220 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1221 int64_t IdxValue = Idx == 0 ? Value : 0;
1222
1223 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1224 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1225 Builder.addImm(IdxValue);
1226 }
1227}
1228
1229const TargetRegisterClass *
1230SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1231  return &AMDGPU::VGPR_32RegClass;
1232}
1233
1234void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1235                                     MachineBasicBlock::iterator I,
1236                                     const DebugLoc &DL, Register DstReg,
1237                                     ArrayRef<MachineOperand> Cond,
1238                                     Register TrueReg,
1239                                     Register FalseReg) const {
1240  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1241  const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1242 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1243 "Not a VGPR32 reg");
1244
1245 if (Cond.size() == 1) {
1246 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1247 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1248 .add(Cond[0]);
1249 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1250 .addImm(0)
1251 .addReg(FalseReg)
1252 .addImm(0)
1253 .addReg(TrueReg)
1254 .addReg(SReg);
1255 } else if (Cond.size() == 2) {
1256 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1257 switch (Cond[0].getImm()) {
1258 case SIInstrInfo::SCC_TRUE: {
1259 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1260 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1261 : AMDGPU::S_CSELECT_B64), SReg)
1262 .addImm(1)
1263 .addImm(0);
1264 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1265 .addImm(0)
1266 .addReg(FalseReg)
1267 .addImm(0)
1268 .addReg(TrueReg)
1269 .addReg(SReg);
1270 break;
1271 }
1272 case SIInstrInfo::SCC_FALSE: {
1273 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1274 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1275 : AMDGPU::S_CSELECT_B64), SReg)
1276 .addImm(0)
1277 .addImm(1);
1278 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1279 .addImm(0)
1280 .addReg(FalseReg)
1281 .addImm(0)
1282 .addReg(TrueReg)
1283 .addReg(SReg);
1284 break;
1285 }
1286 case SIInstrInfo::VCCNZ: {
1287 MachineOperand RegOp = Cond[1];
1288 RegOp.setImplicit(false);
1289 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1290 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1291 .add(RegOp);
1292 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1293 .addImm(0)
1294 .addReg(FalseReg)
1295 .addImm(0)
1296 .addReg(TrueReg)
1297 .addReg(SReg);
1298 break;
1299 }
1300 case SIInstrInfo::VCCZ: {
1301 MachineOperand RegOp = Cond[1];
1302 RegOp.setImplicit(false);
1303 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1304 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1305 .add(RegOp);
1306 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1307 .addImm(0)
1308 .addReg(TrueReg)
1309 .addImm(0)
1310 .addReg(FalseReg)
1311 .addReg(SReg);
1312 break;
1313 }
1314 case SIInstrInfo::EXECNZ: {
1315 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1316 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1317 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1318 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1319 .addImm(0);
1320 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1321 : AMDGPU::S_CSELECT_B64), SReg)
1322 .addImm(1)
1323 .addImm(0);
1324 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1325 .addImm(0)
1326 .addReg(FalseReg)
1327 .addImm(0)
1328 .addReg(TrueReg)
1329 .addReg(SReg);
1330 break;
1331 }
1332 case SIInstrInfo::EXECZ: {
1333 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1334 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1335 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1336 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1337 .addImm(0);
1338 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1339 : AMDGPU::S_CSELECT_B64), SReg)
1340 .addImm(0)
1341 .addImm(1);
1342 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1343 .addImm(0)
1344 .addReg(FalseReg)
1345 .addImm(0)
1346 .addReg(TrueReg)
1347 .addReg(SReg);
1348 llvm_unreachable("Unhandled branch predicate EXECZ");
1349 break;
1350 }
1351 default:
1352 llvm_unreachable("invalid branch predicate");
1353 }
1354 } else {
1355 llvm_unreachable("Can only handle Cond size 1 or 2");
1356 }
1357}
1358
1359Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1360                               MachineBasicBlock::iterator I,
1361                               const DebugLoc &DL,
1362 Register SrcReg, int Value) const {
1363  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1364  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1365 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1366 .addImm(Value)
1367 .addReg(SrcReg);
1368
1369 return Reg;
1370}
1371
1372Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1373                               MachineBasicBlock::iterator I,
1374                               const DebugLoc &DL,
1375 Register SrcReg, int Value) const {
1376  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1377  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1378 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1379 .addImm(Value)
1380 .addReg(SrcReg);
1381
1382 return Reg;
1383}
1384
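// Returns the plain move opcode to use when copying into a register of class
// DstRC: scalar moves for SGPR classes, VALU moves for VGPRs, and COPY where
// a later pass is expected to expand the copy (e.g. AGPRs and 16-bit SGPRs).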
1385unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1386
1387 if (RI.isAGPRClass(DstRC))
1388 return AMDGPU::COPY;
1389 if (RI.getRegSizeInBits(*DstRC) == 16) {
1390 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1391 // before RA.
1392 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1393 }
1394 if (RI.getRegSizeInBits(*DstRC) == 32)
1395 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1396 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1397 return AMDGPU::S_MOV_B64;
1398 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1399 return AMDGPU::V_MOV_B64_PSEUDO;
1400 return AMDGPU::COPY;
1401}
1402
1403const MCInstrDesc &
1404SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1405                                     bool IsIndirectSrc) const {
1406 if (IsIndirectSrc) {
1407 if (VecSize <= 32) // 4 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1409 if (VecSize <= 64) // 8 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1411 if (VecSize <= 96) // 12 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1413 if (VecSize <= 128) // 16 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1415 if (VecSize <= 160) // 20 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1417 if (VecSize <= 256) // 32 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1419 if (VecSize <= 288) // 36 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1421 if (VecSize <= 320) // 40 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1423 if (VecSize <= 352) // 44 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1425 if (VecSize <= 384) // 48 bytes
1426 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1427 if (VecSize <= 512) // 64 bytes
1428 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1429 if (VecSize <= 1024) // 128 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1431
1432 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1433 }
1434
1435 if (VecSize <= 32) // 4 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1437 if (VecSize <= 64) // 8 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1439 if (VecSize <= 96) // 12 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1441 if (VecSize <= 128) // 16 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1443 if (VecSize <= 160) // 20 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1445 if (VecSize <= 256) // 32 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1447 if (VecSize <= 288) // 36 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1449 if (VecSize <= 320) // 40 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1451 if (VecSize <= 352) // 44 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1453 if (VecSize <= 384) // 48 bytes
1454 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1455 if (VecSize <= 512) // 64 bytes
1456 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1457 if (VecSize <= 1024) // 128 bytes
1458 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1459
1460 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1461}
1462
1463static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1464 if (VecSize <= 32) // 4 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1466 if (VecSize <= 64) // 8 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1468 if (VecSize <= 96) // 12 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1470 if (VecSize <= 128) // 16 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1472 if (VecSize <= 160) // 20 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1474 if (VecSize <= 256) // 32 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1476 if (VecSize <= 288) // 36 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1478 if (VecSize <= 320) // 40 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1480 if (VecSize <= 352) // 44 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1482 if (VecSize <= 384) // 48 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1484 if (VecSize <= 512) // 64 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1486 if (VecSize <= 1024) // 128 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1488
1489 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1490}
1491
1492static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1493 if (VecSize <= 32) // 4 bytes
1494 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1495 if (VecSize <= 64) // 8 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1497 if (VecSize <= 96) // 12 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1499 if (VecSize <= 128) // 16 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1501 if (VecSize <= 160) // 20 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1503 if (VecSize <= 256) // 32 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1505 if (VecSize <= 288) // 36 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1507 if (VecSize <= 320) // 40 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1509 if (VecSize <= 352) // 44 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1511 if (VecSize <= 384) // 48 bytes
1512 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1513 if (VecSize <= 512) // 64 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1515 if (VecSize <= 1024) // 128 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1517
1518 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1519}
1520
1521static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1522 if (VecSize <= 64) // 8 bytes
1523 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1524 if (VecSize <= 128) // 16 bytes
1525 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1526 if (VecSize <= 256) // 32 bytes
1527 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1528 if (VecSize <= 512) // 64 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1530 if (VecSize <= 1024) // 128 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1532
1533 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1534}
1535
1536const MCInstrDesc &
1537SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1538 bool IsSGPR) const {
1539 if (IsSGPR) {
1540 switch (EltSize) {
1541 case 32:
1542 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1543 case 64:
1544 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1545 default:
1546 llvm_unreachable("invalid reg indexing elt size");
1547 }
1548 }
1549
1550 assert(EltSize == 32 && "invalid reg indexing elt size");
1551  return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1552}
1553
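// The spill save/restore helpers below map a spill size in bytes to the
// matching pseudo opcode for each register bank.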
1554static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1555 switch (Size) {
1556 case 4:
1557 return AMDGPU::SI_SPILL_S32_SAVE;
1558 case 8:
1559 return AMDGPU::SI_SPILL_S64_SAVE;
1560 case 12:
1561 return AMDGPU::SI_SPILL_S96_SAVE;
1562 case 16:
1563 return AMDGPU::SI_SPILL_S128_SAVE;
1564 case 20:
1565 return AMDGPU::SI_SPILL_S160_SAVE;
1566 case 24:
1567 return AMDGPU::SI_SPILL_S192_SAVE;
1568 case 28:
1569 return AMDGPU::SI_SPILL_S224_SAVE;
1570 case 32:
1571 return AMDGPU::SI_SPILL_S256_SAVE;
1572 case 36:
1573 return AMDGPU::SI_SPILL_S288_SAVE;
1574 case 40:
1575 return AMDGPU::SI_SPILL_S320_SAVE;
1576 case 44:
1577 return AMDGPU::SI_SPILL_S352_SAVE;
1578 case 48:
1579 return AMDGPU::SI_SPILL_S384_SAVE;
1580 case 64:
1581 return AMDGPU::SI_SPILL_S512_SAVE;
1582 case 128:
1583 return AMDGPU::SI_SPILL_S1024_SAVE;
1584 default:
1585 llvm_unreachable("unknown register size");
1586 }
1587}
1588
1589static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1590 switch (Size) {
1591 case 4:
1592 return AMDGPU::SI_SPILL_V32_SAVE;
1593 case 8:
1594 return AMDGPU::SI_SPILL_V64_SAVE;
1595 case 12:
1596 return AMDGPU::SI_SPILL_V96_SAVE;
1597 case 16:
1598 return AMDGPU::SI_SPILL_V128_SAVE;
1599 case 20:
1600 return AMDGPU::SI_SPILL_V160_SAVE;
1601 case 24:
1602 return AMDGPU::SI_SPILL_V192_SAVE;
1603 case 28:
1604 return AMDGPU::SI_SPILL_V224_SAVE;
1605 case 32:
1606 return AMDGPU::SI_SPILL_V256_SAVE;
1607 case 36:
1608 return AMDGPU::SI_SPILL_V288_SAVE;
1609 case 40:
1610 return AMDGPU::SI_SPILL_V320_SAVE;
1611 case 44:
1612 return AMDGPU::SI_SPILL_V352_SAVE;
1613 case 48:
1614 return AMDGPU::SI_SPILL_V384_SAVE;
1615 case 64:
1616 return AMDGPU::SI_SPILL_V512_SAVE;
1617 case 128:
1618 return AMDGPU::SI_SPILL_V1024_SAVE;
1619 default:
1620 llvm_unreachable("unknown register size");
1621 }
1622}
1623
1624static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1625 switch (Size) {
1626 case 4:
1627 return AMDGPU::SI_SPILL_A32_SAVE;
1628 case 8:
1629 return AMDGPU::SI_SPILL_A64_SAVE;
1630 case 12:
1631 return AMDGPU::SI_SPILL_A96_SAVE;
1632 case 16:
1633 return AMDGPU::SI_SPILL_A128_SAVE;
1634 case 20:
1635 return AMDGPU::SI_SPILL_A160_SAVE;
1636 case 24:
1637 return AMDGPU::SI_SPILL_A192_SAVE;
1638 case 28:
1639 return AMDGPU::SI_SPILL_A224_SAVE;
1640 case 32:
1641 return AMDGPU::SI_SPILL_A256_SAVE;
1642 case 36:
1643 return AMDGPU::SI_SPILL_A288_SAVE;
1644 case 40:
1645 return AMDGPU::SI_SPILL_A320_SAVE;
1646 case 44:
1647 return AMDGPU::SI_SPILL_A352_SAVE;
1648 case 48:
1649 return AMDGPU::SI_SPILL_A384_SAVE;
1650 case 64:
1651 return AMDGPU::SI_SPILL_A512_SAVE;
1652 case 128:
1653 return AMDGPU::SI_SPILL_A1024_SAVE;
1654 default:
1655 llvm_unreachable("unknown register size");
1656 }
1657}
1658
1659static unsigned getAVSpillSaveOpcode(unsigned Size) {
1660 switch (Size) {
1661 case 4:
1662 return AMDGPU::SI_SPILL_AV32_SAVE;
1663 case 8:
1664 return AMDGPU::SI_SPILL_AV64_SAVE;
1665 case 12:
1666 return AMDGPU::SI_SPILL_AV96_SAVE;
1667 case 16:
1668 return AMDGPU::SI_SPILL_AV128_SAVE;
1669 case 20:
1670 return AMDGPU::SI_SPILL_AV160_SAVE;
1671 case 24:
1672 return AMDGPU::SI_SPILL_AV192_SAVE;
1673 case 28:
1674 return AMDGPU::SI_SPILL_AV224_SAVE;
1675 case 32:
1676 return AMDGPU::SI_SPILL_AV256_SAVE;
1677 case 36:
1678 return AMDGPU::SI_SPILL_AV288_SAVE;
1679 case 40:
1680 return AMDGPU::SI_SPILL_AV320_SAVE;
1681 case 44:
1682 return AMDGPU::SI_SPILL_AV352_SAVE;
1683 case 48:
1684 return AMDGPU::SI_SPILL_AV384_SAVE;
1685 case 64:
1686 return AMDGPU::SI_SPILL_AV512_SAVE;
1687 case 128:
1688 return AMDGPU::SI_SPILL_AV1024_SAVE;
1689 default:
1690 llvm_unreachable("unknown register size");
1691 }
1692}
1693
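// WWM (whole wave mode) registers currently only need 32-bit spills; this
// helper merely picks between the AV and plain VGPR flavour of the pseudo.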
1694static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1695 bool IsVectorSuperClass) {
1696  // Currently, only 32-bit WWM register spills are needed.
1697 if (Size != 4)
1698 llvm_unreachable("unknown wwm register spill size");
1699
1700 if (IsVectorSuperClass)
1701 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1702
1703 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1704}
1705
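// Select the spill-save pseudo for a vector register: WWM registers, AV
// super-classes, AGPRs and plain VGPRs each have their own opcode family.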
1706static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1707                                            const TargetRegisterClass *RC,
1708 unsigned Size,
1709 const SIRegisterInfo &TRI,
1710 const SIMachineFunctionInfo &MFI) {
1711 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1712
1713 // Choose the right opcode if spilling a WWM register.
1714  if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1715    return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1716
1717 if (IsVectorSuperClass)
1718 return getAVSpillSaveOpcode(Size);
1719
1720 return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1721                             : getVGPRSpillSaveOpcode(Size);
1722}
1723
1724void SIInstrInfo::storeRegToStackSlot(
1725    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1726    bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1727 const TargetRegisterInfo *TRI, Register VReg) const {
1728  MachineFunction *MF = MBB.getParent();
1729  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1730  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1731 const DebugLoc &DL = MBB.findDebugLoc(MI);
1732
1733 MachinePointerInfo PtrInfo
1734 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1735  MachineMemOperand *MMO = MF->getMachineMemOperand(
1736      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1737 FrameInfo.getObjectAlign(FrameIndex));
1738 unsigned SpillSize = TRI->getSpillSize(*RC);
1739
1740  MachineRegisterInfo &MRI = MF->getRegInfo();
1741  if (RI.isSGPRClass(RC)) {
1742 MFI->setHasSpilledSGPRs();
1743 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1744 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1745 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1746
1747 // We are only allowed to create one new instruction when spilling
1748 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1749 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1750
1751 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1752 // to make sure we are using the correct register class.
1753 if (SrcReg.isVirtual() && SpillSize == 4) {
1754 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1755 }
1756
1757 BuildMI(MBB, MI, DL, OpDesc)
1758 .addReg(SrcReg, getKillRegState(isKill)) // data
1759 .addFrameIndex(FrameIndex) // addr
1760 .addMemOperand(MMO)
1761 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1762
1763 if (RI.spillSGPRToVGPR())
1764 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1765 return;
1766 }
1767
1768 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1769 SpillSize, RI, *MFI);
1770 MFI->setHasSpilledVGPRs();
1771
1772 BuildMI(MBB, MI, DL, get(Opcode))
1773 .addReg(SrcReg, getKillRegState(isKill)) // data
1774 .addFrameIndex(FrameIndex) // addr
1775 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1776 .addImm(0) // offset
1777 .addMemOperand(MMO);
1778}
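// Rough sketch of the MIR produced by the VGPR path above for a 128-bit
// spill (register names and stack slot are illustrative; the operand order
// follows the BuildMI calls: data, frame index, scratch offset register,
// immediate offset, memory operand):
//
//   SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0
//       :: (store (s128) into %stack.0, addrspace 5)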
1779
1780static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1781 switch (Size) {
1782 case 4:
1783 return AMDGPU::SI_SPILL_S32_RESTORE;
1784 case 8:
1785 return AMDGPU::SI_SPILL_S64_RESTORE;
1786 case 12:
1787 return AMDGPU::SI_SPILL_S96_RESTORE;
1788 case 16:
1789 return AMDGPU::SI_SPILL_S128_RESTORE;
1790 case 20:
1791 return AMDGPU::SI_SPILL_S160_RESTORE;
1792 case 24:
1793 return AMDGPU::SI_SPILL_S192_RESTORE;
1794 case 28:
1795 return AMDGPU::SI_SPILL_S224_RESTORE;
1796 case 32:
1797 return AMDGPU::SI_SPILL_S256_RESTORE;
1798 case 36:
1799 return AMDGPU::SI_SPILL_S288_RESTORE;
1800 case 40:
1801 return AMDGPU::SI_SPILL_S320_RESTORE;
1802 case 44:
1803 return AMDGPU::SI_SPILL_S352_RESTORE;
1804 case 48:
1805 return AMDGPU::SI_SPILL_S384_RESTORE;
1806 case 64:
1807 return AMDGPU::SI_SPILL_S512_RESTORE;
1808 case 128:
1809 return AMDGPU::SI_SPILL_S1024_RESTORE;
1810 default:
1811 llvm_unreachable("unknown register size");
1812 }
1813}
1814
1815static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1816 switch (Size) {
1817 case 4:
1818 return AMDGPU::SI_SPILL_V32_RESTORE;
1819 case 8:
1820 return AMDGPU::SI_SPILL_V64_RESTORE;
1821 case 12:
1822 return AMDGPU::SI_SPILL_V96_RESTORE;
1823 case 16:
1824 return AMDGPU::SI_SPILL_V128_RESTORE;
1825 case 20:
1826 return AMDGPU::SI_SPILL_V160_RESTORE;
1827 case 24:
1828 return AMDGPU::SI_SPILL_V192_RESTORE;
1829 case 28:
1830 return AMDGPU::SI_SPILL_V224_RESTORE;
1831 case 32:
1832 return AMDGPU::SI_SPILL_V256_RESTORE;
1833 case 36:
1834 return AMDGPU::SI_SPILL_V288_RESTORE;
1835 case 40:
1836 return AMDGPU::SI_SPILL_V320_RESTORE;
1837 case 44:
1838 return AMDGPU::SI_SPILL_V352_RESTORE;
1839 case 48:
1840 return AMDGPU::SI_SPILL_V384_RESTORE;
1841 case 64:
1842 return AMDGPU::SI_SPILL_V512_RESTORE;
1843 case 128:
1844 return AMDGPU::SI_SPILL_V1024_RESTORE;
1845 default:
1846 llvm_unreachable("unknown register size");
1847 }
1848}
1849
1850static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1851 switch (Size) {
1852 case 4:
1853 return AMDGPU::SI_SPILL_A32_RESTORE;
1854 case 8:
1855 return AMDGPU::SI_SPILL_A64_RESTORE;
1856 case 12:
1857 return AMDGPU::SI_SPILL_A96_RESTORE;
1858 case 16:
1859 return AMDGPU::SI_SPILL_A128_RESTORE;
1860 case 20:
1861 return AMDGPU::SI_SPILL_A160_RESTORE;
1862 case 24:
1863 return AMDGPU::SI_SPILL_A192_RESTORE;
1864 case 28:
1865 return AMDGPU::SI_SPILL_A224_RESTORE;
1866 case 32:
1867 return AMDGPU::SI_SPILL_A256_RESTORE;
1868 case 36:
1869 return AMDGPU::SI_SPILL_A288_RESTORE;
1870 case 40:
1871 return AMDGPU::SI_SPILL_A320_RESTORE;
1872 case 44:
1873 return AMDGPU::SI_SPILL_A352_RESTORE;
1874 case 48:
1875 return AMDGPU::SI_SPILL_A384_RESTORE;
1876 case 64:
1877 return AMDGPU::SI_SPILL_A512_RESTORE;
1878 case 128:
1879 return AMDGPU::SI_SPILL_A1024_RESTORE;
1880 default:
1881 llvm_unreachable("unknown register size");
1882 }
1883}
1884
1885static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1886 switch (Size) {
1887 case 4:
1888 return AMDGPU::SI_SPILL_AV32_RESTORE;
1889 case 8:
1890 return AMDGPU::SI_SPILL_AV64_RESTORE;
1891 case 12:
1892 return AMDGPU::SI_SPILL_AV96_RESTORE;
1893 case 16:
1894 return AMDGPU::SI_SPILL_AV128_RESTORE;
1895 case 20:
1896 return AMDGPU::SI_SPILL_AV160_RESTORE;
1897 case 24:
1898 return AMDGPU::SI_SPILL_AV192_RESTORE;
1899 case 28:
1900 return AMDGPU::SI_SPILL_AV224_RESTORE;
1901 case 32:
1902 return AMDGPU::SI_SPILL_AV256_RESTORE;
1903 case 36:
1904 return AMDGPU::SI_SPILL_AV288_RESTORE;
1905 case 40:
1906 return AMDGPU::SI_SPILL_AV320_RESTORE;
1907 case 44:
1908 return AMDGPU::SI_SPILL_AV352_RESTORE;
1909 case 48:
1910 return AMDGPU::SI_SPILL_AV384_RESTORE;
1911 case 64:
1912 return AMDGPU::SI_SPILL_AV512_RESTORE;
1913 case 128:
1914 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1915 default:
1916 llvm_unreachable("unknown register size");
1917 }
1918}
1919
1920static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1921 bool IsVectorSuperClass) {
1922 // Currently, only 32-bit WWM register spills are needed.
1923 if (Size != 4)
1924 llvm_unreachable("unknown wwm register spill size");
1925
1926 if (IsVectorSuperClass)
1927 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1928
1929 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1930}
1931
1932static unsigned
1933 getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1934 unsigned Size, const SIRegisterInfo &TRI,
1935 const SIMachineFunctionInfo &MFI) {
1936 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1937
1938 // Choose the right opcode if restoring a WWM register.
1939 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1940 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1941
1942 if (IsVectorSuperClass)
1943 return getAVSpillRestoreOpcode(Size);
1944
1945 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1946 : getVGPRSpillRestoreOpcode(Size);
1947}
1948
1949 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1950 MachineBasicBlock::iterator MI,
1951 Register DestReg, int FrameIndex,
1952 const TargetRegisterClass *RC,
1953 const TargetRegisterInfo *TRI,
1954 Register VReg) const {
1955 MachineFunction *MF = MBB.getParent();
1956 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1957 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1958 const DebugLoc &DL = MBB.findDebugLoc(MI);
1959 unsigned SpillSize = TRI->getSpillSize(*RC);
1960
1961 MachinePointerInfo PtrInfo
1962 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1963
1964 MachineMemOperand *MMO = MF->getMachineMemOperand(
1965 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1966 FrameInfo.getObjectAlign(FrameIndex));
1967
1968 if (RI.isSGPRClass(RC)) {
1969 MFI->setHasSpilledSGPRs();
1970 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1971 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1972 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1973
1974 // FIXME: Maybe this should not include a memoperand because it will be
1975 // lowered to non-memory instructions.
1976 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1977 if (DestReg.isVirtual() && SpillSize == 4) {
1978 MachineRegisterInfo &MRI = MF->getRegInfo();
1979 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1980 }
1981
1982 if (RI.spillSGPRToVGPR())
1983 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1984 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1985 .addFrameIndex(FrameIndex) // addr
1986 .addMemOperand(MMO)
1987 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1988
1989 return;
1990 }
1991
1992 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1993 SpillSize, RI, *MFI);
1994 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1995 .addFrameIndex(FrameIndex) // vaddr
1996 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1997 .addImm(0) // offset
1998 .addMemOperand(MMO);
1999}
2000
2001 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
2002 MachineBasicBlock::iterator MI) const {
2003 insertNoops(MBB, MI, 1);
2004}
2005
2006 void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
2007 MachineBasicBlock::iterator MI,
2008 unsigned Quantity) const {
2009 DebugLoc DL = MBB.findDebugLoc(MI);
2010 while (Quantity > 0) {
2011 unsigned Arg = std::min(Quantity, 8u);
2012 Quantity -= Arg;
2013 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
2014 }
2015}
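// Each s_nop immediate N inserts N + 1 wait states (0..7 maps to 1..8), so
// the loop above greedily emits chunks of eight. For example,
// insertNoops(MBB, MI, 10) produces "s_nop 7" followed by "s_nop 1".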
2016
2017 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
2018 auto *MF = MBB.getParent();
2019 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2020
2021 assert(Info->isEntryFunction());
2022
2023 if (MBB.succ_empty()) {
2024 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2025 if (HasNoTerminator) {
2026 if (Info->returnsVoid()) {
2027 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2028 } else {
2029 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2030 }
2031 }
2032 }
2033}
2034
2035 MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
2036 MachineBasicBlock &MBB,
2037 MachineInstr &MI,
2038 const DebugLoc &DL) const {
2039 MachineFunction *MF = MBB.getParent();
2040 constexpr unsigned DoorbellIDMask = 0x3ff;
2041 constexpr unsigned ECQueueWaveAbort = 0x400;
2042
2043 MachineBasicBlock *TrapBB = &MBB;
2044 MachineBasicBlock *ContBB = &MBB;
2045 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2046
2047 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2048 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2049 TrapBB = MF->CreateMachineBasicBlock();
2050 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2051 MF->push_back(TrapBB);
2052 MBB.addSuccessor(TrapBB);
2053 }
2054
2055 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
2056 // this will be a nop.
2057 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2058 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2059 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2060 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2061 DoorbellReg)
2062 .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2063 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2064 .addUse(AMDGPU::M0);
2065 Register DoorbellRegMasked =
2066 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2067 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2068 .addUse(DoorbellReg)
2069 .addImm(DoorbellIDMask);
2070 Register SetWaveAbortBit =
2071 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2072 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2073 .addUse(DoorbellRegMasked)
2074 .addImm(ECQueueWaveAbort);
2075 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2076 .addUse(SetWaveAbortBit);
2077 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2078 .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
2079 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2080 .addUse(AMDGPU::TTMP2);
2081 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2082 TrapBB->addSuccessor(HaltLoopBB);
2083
2084 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2085 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2086 .addMBB(HaltLoopBB);
2087 MF->push_back(HaltLoopBB);
2088 HaltLoopBB->addSuccessor(HaltLoopBB);
2089
2090 return ContBB;
2091}
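// Shape of the control flow built above: the original block conditionally
// branches (exec non-zero) to TrapBB, which raises the trap, saves M0 to
// TTMP2, masks the doorbell ID returned by s_sendmsg_rtn, sets the
// queue-wave-abort bit, signals the interrupt message, restores M0, and
// branches into HaltLoopBB; HaltLoopBB halts the wave and loops on itself.
// ContBB, the split-off remainder of the original block, is returned.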
2092
2093 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2094 switch (MI.getOpcode()) {
2095 default:
2096 if (MI.isMetaInstruction())
2097 return 0;
2098 return 1; // FIXME: Do wait states equal cycles?
2099
2100 case AMDGPU::S_NOP:
2101 return MI.getOperand(0).getImm() + 1;
2102 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2103 // hazard, even if one exists, won't really be visible. Should we handle it?
2104 }
2105}
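// For example, "s_nop 3" counts as 4 wait states above, meta instructions
// count as 0, and every other instruction counts as a single wait state.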
2106
2107 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2108 MachineBasicBlock &MBB = *MI.getParent();
2109 DebugLoc DL = MBB.findDebugLoc(MI);
2110 switch (MI.getOpcode()) {
2111 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2112 case AMDGPU::S_MOV_B64_term:
2113 // This is only a terminator to get the correct spill code placement during
2114 // register allocation.
2115 MI.setDesc(get(AMDGPU::S_MOV_B64));
2116 break;
2117
2118 case AMDGPU::S_MOV_B32_term:
2119 // This is only a terminator to get the correct spill code placement during
2120 // register allocation.
2121 MI.setDesc(get(AMDGPU::S_MOV_B32));
2122 break;
2123
2124 case AMDGPU::S_XOR_B64_term:
2125 // This is only a terminator to get the correct spill code placement during
2126 // register allocation.
2127 MI.setDesc(get(AMDGPU::S_XOR_B64));
2128 break;
2129
2130 case AMDGPU::S_XOR_B32_term:
2131 // This is only a terminator to get the correct spill code placement during
2132 // register allocation.
2133 MI.setDesc(get(AMDGPU::S_XOR_B32));
2134 break;
2135 case AMDGPU::S_OR_B64_term:
2136 // This is only a terminator to get the correct spill code placement during
2137 // register allocation.
2138 MI.setDesc(get(AMDGPU::S_OR_B64));
2139 break;
2140 case AMDGPU::S_OR_B32_term:
2141 // This is only a terminator to get the correct spill code placement during
2142 // register allocation.
2143 MI.setDesc(get(AMDGPU::S_OR_B32));
2144 break;
2145
2146 case AMDGPU::S_ANDN2_B64_term:
2147 // This is only a terminator to get the correct spill code placement during
2148 // register allocation.
2149 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2150 break;
2151
2152 case AMDGPU::S_ANDN2_B32_term:
2153 // This is only a terminator to get the correct spill code placement during
2154 // register allocation.
2155 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2156 break;
2157
2158 case AMDGPU::S_AND_B64_term:
2159 // This is only a terminator to get the correct spill code placement during
2160 // register allocation.
2161 MI.setDesc(get(AMDGPU::S_AND_B64));
2162 break;
2163
2164 case AMDGPU::S_AND_B32_term:
2165 // This is only a terminator to get the correct spill code placement during
2166 // register allocation.
2167 MI.setDesc(get(AMDGPU::S_AND_B32));
2168 break;
2169
2170 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2171 // This is only a terminator to get the correct spill code placement during
2172 // register allocation.
2173 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2174 break;
2175
2176 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2177 // This is only a terminator to get the correct spill code placement during
2178 // register allocation.
2179 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2180 break;
2181
2182 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2183 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2184 break;
2185
2186 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2187 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2188 break;
2189
2190 case AMDGPU::V_MOV_B64_PSEUDO: {
2191 Register Dst = MI.getOperand(0).getReg();
2192 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2193 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2194
2195 const MachineOperand &SrcOp = MI.getOperand(1);
2196 // FIXME: Will this work for 64-bit floating point immediates?
2197 assert(!SrcOp.isFPImm());
2198 if (ST.hasMovB64()) {
2199 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2200 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2201 isUInt<32>(SrcOp.getImm()))
2202 break;
2203 }
2204 if (SrcOp.isImm()) {
2205 APInt Imm(64, SrcOp.getImm());
2206 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2207 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2208 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2209 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2210 .addImm(SISrcMods::OP_SEL_1)
2211 .addImm(Lo.getSExtValue())
2212 .addImm(SISrcMods::OP_SEL_1)
2213 .addImm(Lo.getSExtValue())
2214 .addImm(0) // op_sel_lo
2215 .addImm(0) // op_sel_hi
2216 .addImm(0) // neg_lo
2217 .addImm(0) // neg_hi
2218 .addImm(0); // clamp
2219 } else {
2220 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2221 .addImm(Lo.getSExtValue())
2222 .addReg(Dst, RegState::Implicit | RegState::Define);
2223 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2224 .addImm(Hi.getSExtValue())
2225 .addReg(Dst, RegState::Implicit | RegState::Define);
2226 }
2227 } else {
2228 assert(SrcOp.isReg());
2229 if (ST.hasPkMovB32() &&
2230 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2231 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2232 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2233 .addReg(SrcOp.getReg())
2234 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2235 .addReg(SrcOp.getReg())
2236 .addImm(0) // op_sel_lo
2237 .addImm(0) // op_sel_hi
2238 .addImm(0) // neg_lo
2239 .addImm(0) // neg_hi
2240 .addImm(0); // clamp
2241 } else {
2242 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2243 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2244 .addReg(Dst, RegState::Implicit | RegState::Define);
2245 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2246 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2247 .addReg(Dst, RegState::Implicit | RegState::Define);
2248 }
2249 }
2250 MI.eraseFromParent();
2251 break;
2252 }
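// The V_MOV_B64_PSEUDO expansion above tries, in order: a single v_mov_b64
// when the subtarget has it and the operand is a register, an inline
// constant, or fits in 32 bits; a packed v_pk_mov_b32 when both 32-bit halves
// are an identical inline constant or the source is a non-AGPR register; and
// otherwise two v_mov_b32 writes to sub0 and sub1.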
2253 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2254 expandMovDPP64(MI);
2255 break;
2256 }
2257 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2258 const MachineOperand &SrcOp = MI.getOperand(1);
2259 assert(!SrcOp.isFPImm());
2260 APInt Imm(64, SrcOp.getImm());
2261 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2262 MI.setDesc(get(AMDGPU::S_MOV_B64));
2263 break;
2264 }
2265
2266 Register Dst = MI.getOperand(0).getReg();
2267 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2268 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2269
2270 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2271 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2272 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2273 .addImm(Lo.getSExtValue())
2274 .addReg(Dst, RegState::Implicit | RegState::Define);
2275 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2276 .addImm(Hi.getSExtValue())
2277 .addReg(Dst, RegState::Implicit | RegState::Define);
2278 MI.eraseFromParent();
2279 break;
2280 }
2281 case AMDGPU::V_SET_INACTIVE_B32: {
2282 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2283 Register DstReg = MI.getOperand(0).getReg();
2284 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2285 .add(MI.getOperand(3))
2286 .add(MI.getOperand(4))
2287 .add(MI.getOperand(1))
2288 .add(MI.getOperand(2))
2289 .add(MI.getOperand(5));
2290 MI.eraseFromParent();
2291 break;
2292 }
2293 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2294 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2295 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2296 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2297 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2298 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2299 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2300 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2301 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2302 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2303 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2304 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2305 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2306 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2307 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2308 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2309 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2310 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2311 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2312 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2313 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2314 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2315 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2316 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2317 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2318 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2319 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2320 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2322 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2323
2324 unsigned Opc;
2325 if (RI.hasVGPRs(EltRC)) {
2326 Opc = AMDGPU::V_MOVRELD_B32_e32;
2327 } else {
2328 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2329 : AMDGPU::S_MOVRELD_B32;
2330 }
2331
2332 const MCInstrDesc &OpDesc = get(Opc);
2333 Register VecReg = MI.getOperand(0).getReg();
2334 bool IsUndef = MI.getOperand(1).isUndef();
2335 unsigned SubReg = MI.getOperand(3).getImm();
2336 assert(VecReg == MI.getOperand(1).getReg());
2337
2338 MachineInstrBuilder MIB =
2339 BuildMI(MBB, MI, DL, OpDesc)
2340 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2341 .add(MI.getOperand(2))
2342 .addReg(VecReg, RegState::ImplicitDefine)
2343 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2344
2345 const int ImpDefIdx =
2346 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2347 const int ImpUseIdx = ImpDefIdx + 1;
2348 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2349 MI.eraseFromParent();
2350 break;
2351 }
2352 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2353 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2354 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2355 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2356 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2357 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2358 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2359 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2360 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2361 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2362 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2363 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2364 assert(ST.useVGPRIndexMode());
2365 Register VecReg = MI.getOperand(0).getReg();
2366 bool IsUndef = MI.getOperand(1).isUndef();
2367 Register Idx = MI.getOperand(3).getReg();
2368 Register SubReg = MI.getOperand(4).getImm();
2369
2370 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2371 .addReg(Idx)
2372 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2373 SetOn->getOperand(3).setIsUndef();
2374
2375 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2376 MachineInstrBuilder MIB =
2377 BuildMI(MBB, MI, DL, OpDesc)
2378 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2379 .add(MI.getOperand(2))
2380 .addReg(VecReg, RegState::ImplicitDefine)
2381 .addReg(VecReg,
2382 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2383
2384 const int ImpDefIdx =
2385 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2386 const int ImpUseIdx = ImpDefIdx + 1;
2387 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2388
2389 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2390
2391 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2392
2393 MI.eraseFromParent();
2394 break;
2395 }
2396 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2397 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2398 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2399 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2400 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2401 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2402 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2403 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2404 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2405 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2406 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2407 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2408 assert(ST.useVGPRIndexMode());
2409 Register Dst = MI.getOperand(0).getReg();
2410 Register VecReg = MI.getOperand(1).getReg();
2411 bool IsUndef = MI.getOperand(1).isUndef();
2412 Register Idx = MI.getOperand(2).getReg();
2413 Register SubReg = MI.getOperand(3).getImm();
2414
2415 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2416 .addReg(Idx)
2417 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2418 SetOn->getOperand(3).setIsUndef();
2419
2420 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2421 .addDef(Dst)
2422 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2423 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2424
2425 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2426
2427 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2428
2429 MI.eraseFromParent();
2430 break;
2431 }
2432 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2433 MachineFunction &MF = *MBB.getParent();
2434 Register Reg = MI.getOperand(0).getReg();
2435 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2436 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2437 MachineOperand OpLo = MI.getOperand(1);
2438 MachineOperand OpHi = MI.getOperand(2);
2439
2440 // Create a bundle so these instructions won't be re-ordered by the
2441 // post-RA scheduler.
2442 MIBundleBuilder Bundler(MBB, MI);
2443 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2444
2445 // What we want here is an offset from the value returned by s_getpc (which
2446 // is the address of the s_add_u32 instruction) to the global variable, but
2447 // since the encoding of $symbol starts 4 bytes after the start of the
2448 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2449 // small. This requires us to add 4 to the global variable offset in order
2450 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2451 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2452 // instruction.
2453
2454 int64_t Adjust = 0;
2455 if (ST.hasGetPCZeroExtension()) {
2456 // Fix up hardware that does not sign-extend the 48-bit PC value by
2457 // inserting: s_sext_i32_i16 reghi, reghi
2458 Bundler.append(
2459 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2460 Adjust += 4;
2461 }
2462
2463 if (OpLo.isGlobal())
2464 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2465 Bundler.append(
2466 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2467
2468 if (OpHi.isGlobal())
2469 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2470 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2471 .addReg(RegHi)
2472 .add(OpHi));
2473
2474 finalizeBundle(MBB, Bundler.begin());
2475
2476 MI.eraseFromParent();
2477 break;
2478 }
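// A sketch of the bundle built above, without the s_sext_i32_i16 fixup
// (register pair and relocation spelling are illustrative; the +4/+12 account
// for where $symbol is encoded relative to the start of s_add_u32):
//
//   s_getpc_b64 s[0:1]
//   s_add_u32   s0, s0, sym@rel32@lo+4
//   s_addc_u32  s1, s1, sym@rel32@hi+12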
2479 case AMDGPU::ENTER_STRICT_WWM: {
2480 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2481 // Whole Wave Mode is entered.
2482 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2483 : AMDGPU::S_OR_SAVEEXEC_B64));
2484 break;
2485 }
2486 case AMDGPU::ENTER_STRICT_WQM: {
2487 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2488 // STRICT_WQM is entered.
2489 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2490 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2491 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2492 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2493 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2494
2495 MI.eraseFromParent();
2496 break;
2497 }
2498 case AMDGPU::EXIT_STRICT_WWM:
2499 case AMDGPU::EXIT_STRICT_WQM: {
2500 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2501 // WWM/STRICT_WQM is exited.
2502 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2503 break;
2504 }
2505 case AMDGPU::SI_RETURN: {
2506 const MachineFunction *MF = MBB.getParent();
2507 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2508 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2509 // Hiding the return address use with SI_RETURN may lead to extra kills in
2510 // the function and missing live-ins. We are fine in practice because callee
2511 // saved register handling ensures the register value is restored before
2512 // RET, but we need the undef flag here to appease the MachineVerifier
2513 // liveness checks.
2514 MachineInstrBuilder MIB =
2515 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2516 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2517
2518 MIB.copyImplicitOps(MI);
2519 MI.eraseFromParent();
2520 break;
2521 }
2522
2523 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2524 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2525 MI.setDesc(get(AMDGPU::S_MUL_U64));
2526 break;
2527
2528 case AMDGPU::S_GETPC_B64_pseudo:
2529 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2530 if (ST.hasGetPCZeroExtension()) {
2531 Register Dst = MI.getOperand(0).getReg();
2532 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2533 // Fix up hardware that does not sign-extend the 48-bit PC value by
2534 // inserting: s_sext_i32_i16 dsthi, dsthi
2535 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2536 DstHi)
2537 .addReg(DstHi);
2538 }
2539 break;
2540 }
2541 return true;
2542}
2543
2544 void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2545 MachineBasicBlock::iterator I, Register DestReg,
2546 unsigned SubIdx, const MachineInstr &Orig,
2547 const TargetRegisterInfo &RI) const {
2548
2549 // Try shrinking the instruction to remat only the part needed for current
2550 // context.
2551 // TODO: Handle more cases.
2552 unsigned Opcode = Orig.getOpcode();
2553 switch (Opcode) {
2554 case AMDGPU::S_LOAD_DWORDX16_IMM:
2555 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2556 if (SubIdx != 0)
2557 break;
2558
2559 if (I == MBB.end())
2560 break;
2561
2562 if (I->isBundled())
2563 break;
2564
2565 // Look for a single use of the register that is also a subreg.
2566 Register RegToFind = Orig.getOperand(0).getReg();
2567 MachineOperand *UseMO = nullptr;
2568 for (auto &CandMO : I->operands()) {
2569 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2570 continue;
2571 if (UseMO) {
2572 UseMO = nullptr;
2573 break;
2574 }
2575 UseMO = &CandMO;
2576 }
2577 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2578 break;
2579
2580 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2581 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2582
2583 MachineFunction *MF = MBB.getParent();
2584 MachineRegisterInfo &MRI = MF->getRegInfo();
2585 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2586
2587 unsigned NewOpcode = -1;
2588 if (SubregSize == 256)
2589 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2590 else if (SubregSize == 128)
2591 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2592 else
2593 break;
2594
2595 const MCInstrDesc &TID = get(NewOpcode);
2596 const TargetRegisterClass *NewRC =
2597 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2598 MRI.setRegClass(DestReg, NewRC);
2599
2600 UseMO->setReg(DestReg);
2601 UseMO->setSubReg(AMDGPU::NoSubRegister);
2602
2603 // Use a smaller load with the desired size, possibly with updated offset.
2604 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2605 MI->setDesc(TID);
2606 MI->getOperand(0).setReg(DestReg);
2607 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2608 if (Offset) {
2609 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2610 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2611 OffsetMO->setImm(FinalOffset);
2612 }
2613 SmallVector<MachineMemOperand *> NewMMOs;
2614 for (const MachineMemOperand *MemOp : Orig.memoperands())
2615 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2616 SubregSize / 8));
2617 MI->setMemRefs(*MF, NewMMOs);
2618
2619 MBB.insert(I, MI);
2620 return;
2621 }
2622
2623 default:
2624 break;
2625 }
2626
2627 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2628}
2629
2630std::pair<MachineInstr*, MachineInstr*>
2631 SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2632 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2633
2634 if (ST.hasMovB64() &&
2635 AMDGPU::isLegal64BitDPPControl(
2636 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2637 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2638 return std::pair(&MI, nullptr);
2639 }
2640
2641 MachineBasicBlock &MBB = *MI.getParent();
2645 Register Dst = MI.getOperand(0).getReg();
2646 unsigned Part = 0;
2647 MachineInstr *Split[2];
2648
2649 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2650 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2651 if (Dst.isPhysical()) {
2652 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2653 } else {
2654 assert(MRI.isSSA());
2655 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2656 MovDPP.addDef(Tmp);
2657 }
2658
2659 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2660 const MachineOperand &SrcOp = MI.getOperand(I);
2661 assert(!SrcOp.isFPImm());
2662 if (SrcOp.isImm()) {
2663 APInt Imm(64, SrcOp.getImm());
2664 Imm.ashrInPlace(Part * 32);
2665 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2666 } else {
2667 assert(SrcOp.isReg());
2668 Register Src = SrcOp.getReg();
2669 if (Src.isPhysical())
2670 MovDPP.addReg(RI.getSubReg(Src, Sub));
2671 else
2672 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2673 }
2674 }
2675
2676 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2677 MovDPP.addImm(MO.getImm());
2678
2679 Split[Part] = MovDPP;
2680 ++Part;
2681 }
2682
2683 if (Dst.isVirtual())
2684 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2685 .addReg(Split[0]->getOperand(0).getReg())
2686 .addImm(AMDGPU::sub0)
2687 .addReg(Split[1]->getOperand(0).getReg())
2688 .addImm(AMDGPU::sub1);
2689
2690 MI.eraseFromParent();
2691 return std::pair(Split[0], Split[1]);
2692}
2693
2694std::optional<DestSourcePair>
2696 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2697 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2698
2699 return std::nullopt;
2700}
2701
2702 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
2703 MachineOperand &Src0,
2704 unsigned Src0OpName,
2705 MachineOperand &Src1,
2706 unsigned Src1OpName) const {
2707 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2708 if (!Src0Mods)
2709 return false;
2710
2711 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2712 assert(Src1Mods &&
2713 "All commutable instructions have both src0 and src1 modifiers");
2714
2715 int Src0ModsVal = Src0Mods->getImm();
2716 int Src1ModsVal = Src1Mods->getImm();
2717
2718 Src1Mods->setImm(Src0ModsVal);
2719 Src0Mods->setImm(Src1ModsVal);
2720 return true;
2721}
2722
2723 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2724 MachineOperand &RegOp,
2725 MachineOperand &NonRegOp) {
2726 Register Reg = RegOp.getReg();
2727 unsigned SubReg = RegOp.getSubReg();
2728 bool IsKill = RegOp.isKill();
2729 bool IsDead = RegOp.isDead();
2730 bool IsUndef = RegOp.isUndef();
2731 bool IsDebug = RegOp.isDebug();
2732
2733 if (NonRegOp.isImm())
2734 RegOp.ChangeToImmediate(NonRegOp.getImm());
2735 else if (NonRegOp.isFI())
2736 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2737 else if (NonRegOp.isGlobal()) {
2738 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2739 NonRegOp.getTargetFlags());
2740 } else
2741 return nullptr;
2742
2743 // Make sure we don't reinterpret a subreg index in the target flags.
2744 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2745
2746 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2747 NonRegOp.setSubReg(SubReg);
2748
2749 return &MI;
2750}
2751
2752 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2753 unsigned Src0Idx,
2754 unsigned Src1Idx) const {
2755 assert(!NewMI && "this should never be used");
2756
2757 unsigned Opc = MI.getOpcode();
2758 int CommutedOpcode = commuteOpcode(Opc);
2759 if (CommutedOpcode == -1)
2760 return nullptr;
2761
2762 if (Src0Idx > Src1Idx)
2763 std::swap(Src0Idx, Src1Idx);
2764
2765 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2766 static_cast<int>(Src0Idx) &&
2767 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2768 static_cast<int>(Src1Idx) &&
2769 "inconsistency with findCommutedOpIndices");
2770
2771 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2772 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2773
2774 MachineInstr *CommutedMI = nullptr;
2775 if (Src0.isReg() && Src1.isReg()) {
2776 if (isOperandLegal(MI, Src1Idx, &Src0)) {
2777 // Be sure to copy the source modifiers to the right place.
2778 CommutedMI
2779 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2780 }
2781
2782 } else if (Src0.isReg() && !Src1.isReg()) {
2783 if (isOperandLegal(MI, Src1Idx, &Src0))
2784 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2785 } else if (!Src0.isReg() && Src1.isReg()) {
2786 if (isOperandLegal(MI, Src1Idx, &Src0))
2787 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2788 } else {
2789 // FIXME: Found two non registers to commute. This does happen.
2790 return nullptr;
2791 }
2792
2793 if (CommutedMI) {
2794 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2795 Src1, AMDGPU::OpName::src1_modifiers);
2796
2797 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2798 AMDGPU::OpName::src1_sel);
2799
2800 CommutedMI->setDesc(get(CommutedOpcode));
2801 }
2802
2803 return CommutedMI;
2804}
2805
2806// This needs to be implemented because the source modifiers may be inserted
2807// between the true commutable operands, and the base
2808// TargetInstrInfo::commuteInstruction uses it.
2809 bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2810 unsigned &SrcOpIdx0,
2811 unsigned &SrcOpIdx1) const {
2812 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2813}
2814
2815 bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2816 unsigned &SrcOpIdx0,
2817 unsigned &SrcOpIdx1) const {
2818 if (!Desc.isCommutable())
2819 return false;
2820
2821 unsigned Opc = Desc.getOpcode();
2822 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2823 if (Src0Idx == -1)
2824 return false;
2825
2826 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2827 if (Src1Idx == -1)
2828 return false;
2829
2830 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2831}
2832
2833 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2834 int64_t BrOffset) const {
2835 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2836 // block is unanalyzable.
2837 assert(BranchOp != AMDGPU::S_SETPC_B64);
2838
2839 // Convert to dwords.
2840 BrOffset /= 4;
2841
2842 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2843 // from the next instruction.
2844 BrOffset -= 1;
2845
2846 return isIntN(BranchOffsetBits, BrOffset);
2847}
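// Worked example with the default 16-bit limit: a forward byte offset of
// 131072 becomes 131072 / 4 - 1 = 32767 dwords, which still fits in a signed
// 16-bit immediate, while 131076 bytes (32768 dwords) does not.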
2848
2849 MachineBasicBlock *
2850 SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
2851 return MI.getOperand(0).getMBB();
2852}
2853
2854 bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const {
2855 for (const MachineInstr &MI : MBB->terminators()) {
2856 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2857 MI.getOpcode() == AMDGPU::SI_LOOP)
2858 return true;
2859 }
2860 return false;
2861}
2862
2863 void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2864 MachineBasicBlock &DestBB,
2865 MachineBasicBlock &RestoreBB,
2866 const DebugLoc &DL, int64_t BrOffset,
2867 RegScavenger *RS) const {
2868 assert(RS && "RegScavenger required for long branching");
2869 assert(MBB.empty() &&
2870 "new block should be inserted for expanding unconditional branch");
2871 assert(MBB.pred_size() == 1);
2872 assert(RestoreBB.empty() &&
2873 "restore block should be inserted for restoring clobbered registers");
2874
2875 MachineFunction *MF = MBB.getParent();
2876 MachineRegisterInfo &MRI = MF->getRegInfo();
2877 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2878
2879 // FIXME: Virtual register workaround for RegScavenger not working with empty
2880 // blocks.
2881 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2882
2883 auto I = MBB.end();
2884
2885 // Note: as this is used after hazard recognizer we need to apply some hazard
2886 // workarounds directly.
2887 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2888 ST.hasVALUReadSGPRHazard();
2889 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2890 if (FlushSGPRWrites)
2891 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2892 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2893 };
2894
2895 // We need to compute the offset relative to the instruction immediately after
2896 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2897 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2898 ApplyHazardWorkarounds();
2899
2900 auto &MCCtx = MF->getContext();
2901 MCSymbol *PostGetPCLabel =
2902 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2903 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2904
2905 MCSymbol *OffsetLo =
2906 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2907 MCSymbol *OffsetHi =
2908 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2909 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2910 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2911 .addReg(PCReg, 0, AMDGPU::sub0)
2912 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2913 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2914 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2915 .addReg(PCReg, 0, AMDGPU::sub1)
2916 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2917 ApplyHazardWorkarounds();
2918
2919 // Insert the indirect branch after the other terminator.
2920 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2921 .addReg(PCReg);
2922
2923 // If a spill is needed for the pc register pair, we need to insert a spill
2924 // restore block right before the destination block, and insert a short branch
2925 // into the old destination block's fallthrough predecessor.
2926 // e.g.:
2927 //
2928 // s_cbranch_scc0 skip_long_branch:
2929 //
2930 // long_branch_bb:
2931 // spill s[8:9]
2932 // s_getpc_b64 s[8:9]
2933 // s_add_u32 s8, s8, restore_bb
2934 // s_addc_u32 s9, s9, 0
2935 // s_setpc_b64 s[8:9]
2936 //
2937 // skip_long_branch:
2938 // foo;
2939 //
2940 // .....
2941 //
2942 // dest_bb_fallthrough_predecessor:
2943 // bar;
2944 // s_branch dest_bb
2945 //
2946 // restore_bb:
2947 // restore s[8:9]
2948 // fallthrough dest_bb
2949 //
2950 // dest_bb:
2951 // buzz;
2952
2953 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2954 Register Scav;
2955
2956 // If we've previously reserved a register for long branches
2957 // avoid running the scavenger and just use those registers
2958 if (LongBranchReservedReg) {
2959 RS->enterBasicBlock(MBB);
2960 Scav = LongBranchReservedReg;
2961 } else {
2962 RS->enterBasicBlockEnd(MBB);
2963 Scav = RS->scavengeRegisterBackwards(
2964 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2965 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2966 }
2967 if (Scav) {
2968 RS->setRegUsed(Scav);
2969 MRI.replaceRegWith(PCReg, Scav);
2970 MRI.clearVirtRegs();
2971 } else {
2972 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
2973 // SGPR spill.
2974 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2975 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2976 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
2977 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
2978 MRI.clearVirtRegs();
2979 }
2980
2981 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
2982 // Now that both labels are in place, the distance can be computed.
2983 auto *Offset = MCBinaryExpr::createSub(
2984 MCSymbolRefExpr::create(DestLabel, MCCtx),
2985 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
2986 // Add offset assignments.
2987 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
2988 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
2989 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
2990 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
2991}
2992
2993unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2994 switch (Cond) {
2995 case SIInstrInfo::SCC_TRUE:
2996 return AMDGPU::S_CBRANCH_SCC1;
2997 case SIInstrInfo::SCC_FALSE:
2998 return AMDGPU::S_CBRANCH_SCC0;
2999 case SIInstrInfo::VCCNZ:
3000 return AMDGPU::S_CBRANCH_VCCNZ;
3001 case SIInstrInfo::VCCZ:
3002 return AMDGPU::S_CBRANCH_VCCZ;
3003 case SIInstrInfo::EXECNZ:
3004 return AMDGPU::S_CBRANCH_EXECNZ;
3005 case SIInstrInfo::EXECZ:
3006 return AMDGPU::S_CBRANCH_EXECZ;
3007 default:
3008 llvm_unreachable("invalid branch predicate");
3009 }
3010}
3011
3012SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3013 switch (Opcode) {
3014 case AMDGPU::S_CBRANCH_SCC0:
3015 return SCC_FALSE;
3016 case AMDGPU::S_CBRANCH_SCC1:
3017 return SCC_TRUE;
3018 case AMDGPU::S_CBRANCH_VCCNZ:
3019 return VCCNZ;
3020 case AMDGPU::S_CBRANCH_VCCZ:
3021 return VCCZ;
3022 case AMDGPU::S_CBRANCH_EXECNZ:
3023 return EXECNZ;
3024 case AMDGPU::S_CBRANCH_EXECZ:
3025 return EXECZ;
3026 default:
3027 return INVALID_BR;
3028 }
3029}
3030
3031 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3032 MachineBasicBlock::iterator I,
3033 MachineBasicBlock *&TBB,
3034 MachineBasicBlock *&FBB,
3035 SmallVectorImpl<MachineOperand> &Cond,
3036 bool AllowModify) const {
3037 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3038 // Unconditional Branch
3039 TBB = I->getOperand(0).getMBB();
3040 return false;
3041 }
3042
3043 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3044 if (Pred == INVALID_BR)
3045 return true;
3046
3047 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3048 Cond.push_back(MachineOperand::CreateImm(Pred));
3049 Cond.push_back(I->getOperand(1)); // Save the branch register.
3050
3051 ++I;
3052
3053 if (I == MBB.end()) {
3054 // Conditional branch followed by fall-through.
3055 TBB = CondBB;
3056 return false;
3057 }
3058
3059 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3060 TBB = CondBB;
3061 FBB = I->getOperand(0).getMBB();
3062 return false;
3063 }
3064
3065 return true;
3066}
3067
3068 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3069 MachineBasicBlock *&FBB,
3070 SmallVectorImpl<MachineOperand> &Cond,
3071 bool AllowModify) const {
3072 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3073 auto E = MBB.end();
3074 if (I == E)
3075 return false;
3076
3077 // Skip over the instructions that are artificially terminators for special
3078 // exec management.
3079 while (I != E && !I->isBranch() && !I->isReturn()) {
3080 switch (I->getOpcode()) {
3081 case AMDGPU::S_MOV_B64_term:
3082 case AMDGPU::S_XOR_B64_term:
3083 case AMDGPU::S_OR_B64_term:
3084 case AMDGPU::S_ANDN2_B64_term:
3085 case AMDGPU::S_AND_B64_term:
3086 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3087 case AMDGPU::S_MOV_B32_term:
3088 case AMDGPU::S_XOR_B32_term:
3089 case AMDGPU::S_OR_B32_term:
3090 case AMDGPU::S_ANDN2_B32_term:
3091 case AMDGPU::S_AND_B32_term:
3092 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3093 break;
3094 case AMDGPU::SI_IF:
3095 case AMDGPU::SI_ELSE:
3096 case AMDGPU::SI_KILL_I1_TERMINATOR:
3097 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3098 // FIXME: It's messy that these need to be considered here at all.
3099 return true;
3100 default:
3101 llvm_unreachable("unexpected non-branch terminator inst");
3102 }
3103
3104 ++I;
3105 }
3106
3107 if (I == E)
3108 return false;
3109
3110 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3111}
3112
3113 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3114 int *BytesRemoved) const {
3115 unsigned Count = 0;
3116 unsigned RemovedSize = 0;
3117 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3118 // Skip over artificial terminators when removing instructions.
3119 if (MI.isBranch() || MI.isReturn()) {
3120 RemovedSize += getInstSizeInBytes(MI);
3121 MI.eraseFromParent();
3122 ++Count;
3123 }
3124 }
3125
3126 if (BytesRemoved)
3127 *BytesRemoved = RemovedSize;
3128
3129 return Count;
3130}
3131
3132// Copy the flags onto the implicit condition register operand.
3133 static void preserveCondRegFlags(MachineOperand &CondReg,
3134 const MachineOperand &OrigCond) {
3135 CondReg.setIsUndef(OrigCond.isUndef());
3136 CondReg.setIsKill(OrigCond.isKill());
3137}
3138
3139 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3140 MachineBasicBlock *TBB,
3141 MachineBasicBlock *FBB,
3142 ArrayRef<MachineOperand> Cond,
3143 const DebugLoc &DL,
3144 int *BytesAdded) const {
3145 if (!FBB && Cond.empty()) {
3146 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3147 .addMBB(TBB);
3148 if (BytesAdded)
3149 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3150 return 1;
3151 }
3152
3153 assert(TBB && Cond[0].isImm());
3154
3155 unsigned Opcode
3156 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3157
3158 if (!FBB) {
3159 MachineInstr *CondBr =
3160 BuildMI(&MBB, DL, get(Opcode))
3161 .addMBB(TBB);
3162
3163 // Copy the flags onto the implicit condition register operand.
3164 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3165 fixImplicitOperands(*CondBr);
3166
3167 if (BytesAdded)
3168 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3169 return 1;
3170 }
3171
3172 assert(TBB && FBB);
3173
3174 MachineInstr *CondBr =
3175 BuildMI(&MBB, DL, get(Opcode))
3176 .addMBB(TBB);
3177 fixImplicitOperands(*CondBr);
3178 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3179 .addMBB(FBB);
3180
3181 MachineOperand &CondReg = CondBr->getOperand(1);
3182 CondReg.setIsUndef(Cond[1].isUndef());
3183 CondReg.setIsKill(Cond[1].isKill());
3184
3185 if (BytesAdded)
3186 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3187
3188 return 2;
3189}
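// insertBranch returns the number of branch instructions emitted (1 or 2).
// BytesAdded counts each branch as 8 bytes instead of 4 on subtargets with
// the branch-offset-0x3f hardware bug, presumably to leave room for the
// padding the workaround may add, which keeps BranchRelaxation's size
// estimates conservative.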
3190
3191 bool SIInstrInfo::reverseBranchCondition(
3192 SmallVectorImpl<MachineOperand> &Cond) const {
3193 if (Cond.size() != 2) {
3194 return true;
3195 }
3196
3197 if (Cond[0].isImm()) {
3198 Cond[0].setImm(-Cond[0].getImm());
3199 return false;
3200 }
3201
3202 return true;
3203}
3204
3205 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3206 ArrayRef<MachineOperand> Cond,
3207 Register DstReg, Register TrueReg,
3208 Register FalseReg, int &CondCycles,
3209 int &TrueCycles, int &FalseCycles) const {
3210 switch (Cond[0].getImm()) {
3211 case VCCNZ:
3212 case VCCZ: {
3213 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3214 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3215 if (MRI.getRegClass(FalseReg) != RC)
3216 return false;
3217
3218 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3219 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3220
3221 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3222 return RI.hasVGPRs(RC) && NumInsts <= 6;
3223 }
3224 case SCC_TRUE:
3225 case SCC_FALSE: {
3226 // FIXME: We could insert for VGPRs if we could replace the original compare
3227 // with a vector one.
3228 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3229 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3230 if (MRI.getRegClass(FalseReg) != RC)
3231 return false;
3232
3233 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3234
3235 // Multiples of 8 can do s_cselect_b64
3236 if (NumInsts % 2 == 0)
3237 NumInsts /= 2;
3238
3239 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3240 return RI.isSGPRClass(RC);
3241 }
3242 default:
3243 return false;
3244 }
3245}
3246
3247 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3248 MachineBasicBlock::iterator I, const DebugLoc &DL,
3249 Register DstReg, ArrayRef<MachineOperand> Cond,
3250 Register TrueReg, Register FalseReg) const {
3251 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3252 if (Pred == VCCZ || Pred == SCC_FALSE) {
3253 Pred = static_cast<BranchPredicate>(-Pred);
3254 std::swap(TrueReg, FalseReg);
3255 }
3256
3257 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3258 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3259 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3260
3261 if (DstSize == 32) {
3262 MachineInstr *Select;
3263 if (Pred == SCC_TRUE) {
3264 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3265 .addReg(TrueReg)
3266 .addReg(FalseReg);
3267 } else {
3268 // Instruction's operands are backwards from what is expected.
3269 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3270 .addReg(FalseReg)
3271 .addReg(TrueReg);
3272 }
3273
3274 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3275 return;
3276 }
3277
3278 if (DstSize == 64 && Pred == SCC_TRUE) {
3279 MachineInstr *Select =
3280 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3281 .addReg(TrueReg)
3282 .addReg(FalseReg);
3283
3284 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3285 return;
3286 }
3287
3288 static const int16_t Sub0_15[] = {
3289 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3290 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3291 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3292 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3293 };
3294
3295 static const int16_t Sub0_15_64[] = {
3296 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3297 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3298 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3299 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3300 };
3301
3302 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3303 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3304 const int16_t *SubIndices = Sub0_15;
3305 int NElts = DstSize / 32;
3306
3307 // 64-bit select is only available for SALU.
3308 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3309 if (Pred == SCC_TRUE) {
3310 if (NElts % 2) {
3311 SelOp = AMDGPU::S_CSELECT_B32;
3312 EltRC = &AMDGPU::SGPR_32RegClass;
3313 } else {
3314 SelOp = AMDGPU::S_CSELECT_B64;
3315 EltRC = &AMDGPU::SGPR_64RegClass;
3316 SubIndices = Sub0_15_64;
3317 NElts /= 2;
3318 }
3319 }
3320
3321 MachineInstrBuilder MIB = BuildMI(
3322 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3323
3324 I = MIB->getIterator();
3325
3326 SmallVector<Register, 8> Regs;
3327 for (int Idx = 0; Idx != NElts; ++Idx) {
3328 Register DstElt = MRI.createVirtualRegister(EltRC);
3329 Regs.push_back(DstElt);
3330
3331 unsigned SubIdx = SubIndices[Idx];
3332
3333 MachineInstr *Select;
3334 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3335 Select =
3336 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3337 .addReg(FalseReg, 0, SubIdx)
3338 .addReg(TrueReg, 0, SubIdx);
3339 } else {
3340 Select =
3341 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3342 .addReg(TrueReg, 0, SubIdx)
3343 .addReg(FalseReg, 0, SubIdx);
3344 }
3345
3346 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3348
3349 MIB.addReg(DstElt)
3350 .addImm(SubIdx);
3351 }
3352}
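// Example of the expansion above: a 128-bit VGPR select under a VCC
// condition becomes four v_cndmask_b32 operations on sub0..sub3 feeding a
// REG_SEQUENCE, while a 128-bit SGPR select under SCC uses two s_cselect_b64
// operations on the 64-bit sub-register pairs.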
3353
3354 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3355 switch (MI.getOpcode()) {
3356 case AMDGPU::V_MOV_B16_t16_e32:
3357 case AMDGPU::V_MOV_B16_t16_e64:
3358 case AMDGPU::V_MOV_B32_e32:
3359 case AMDGPU::V_MOV_B32_e64:
3360 case AMDGPU::V_MOV_B64_PSEUDO:
3361 case AMDGPU::V_MOV_B64_e32:
3362 case AMDGPU::V_MOV_B64_e64:
3363 case AMDGPU::S_MOV_B32:
3364 case AMDGPU::S_MOV_B64:
3365 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3366 case AMDGPU::COPY:
3367 case AMDGPU::WWM_COPY:
3368 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3369 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3370 case AMDGPU::V_ACCVGPR_MOV_B32:
3371 return true;
3372 default:
3373 return false;
3374 }
3375}
3376
3377static constexpr unsigned ModifierOpNames[] = {
3378 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3379 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3380 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3381
3382 void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3383 unsigned Opc = MI.getOpcode();
3384 for (unsigned Name : reverse(ModifierOpNames)) {
3385 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3386 if (Idx >= 0)
3387 MI.removeOperand(Idx);
3388 }
3389}
3390
3391 bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3392 Register Reg, MachineRegisterInfo *MRI) const {
3393 if (!MRI->hasOneNonDBGUse(Reg))
3394 return false;
3395
3396 switch (DefMI.getOpcode()) {
3397 default:
3398 return false;
3399 case AMDGPU::V_MOV_B64_e32:
3400 case AMDGPU::S_MOV_B64:
3401 case AMDGPU::V_MOV_B64_PSEUDO:
3402 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3403 case AMDGPU::V_MOV_B32_e32:
3404 case AMDGPU::S_MOV_B32:
3405 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3406 break;
3407 }
3408
3409 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
3410 assert(ImmOp);
3411 // FIXME: We could handle FrameIndex values here.
3412 if (!ImmOp->isImm())
3413 return false;
3414
3415 auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
3416 int64_t Imm = ImmOp->getImm();
3417 switch (UseOp.getSubReg()) {
3418 default:
3419 return Imm;
3420 case AMDGPU::sub0:
3421 return Lo_32(Imm);
3422 case AMDGPU::sub1:
3423 return Hi_32(Imm);
3424 case AMDGPU::lo16:
3425 return SignExtend64<16>(Imm);
3426 case AMDGPU::hi16:
3427 return SignExtend64<16>(Imm >> 16);
3428 case AMDGPU::sub1_lo16:
3429 return SignExtend64<16>(Imm >> 32);
3430 case AMDGPU::sub1_hi16:
3431 return SignExtend64<16>(Imm >> 48);
3432 }
3433 };
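// For example, with an immediate of 0x1122334455667788 the lambda above
// yields 0x55667788 for sub0, 0x11223344 for sub1, and sign-extended 16-bit
// pieces (0x7788, 0x5566, 0x3344, 0x1122) for the lo16/hi16 sub-registers.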
3434
3435 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3436
3437 unsigned Opc = UseMI.getOpcode();
3438 if (Opc == AMDGPU::COPY) {
3439 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3440
3441 Register DstReg = UseMI.getOperand(0).getReg();
3442 unsigned OpSize = getOpSize(UseMI, 0);
3443 bool Is16Bit = OpSize == 2;
3444 bool Is64Bit = OpSize == 8;
3445 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3446 unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3447 : AMDGPU::V_MOV_B32_e32
3448 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3449 : AMDGPU::S_MOV_B32;
3450 APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)),
3451 /*isSigned=*/true, /*implicitTrunc=*/true);
3452
3453 if (RI.isAGPR(*MRI, DstReg)) {
3454 if (Is64Bit || !isInlineConstant(Imm))
3455 return false;
3456 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3457 }
3458
3459 if (Is16Bit) {
3460 if (isVGPRCopy)
3461 return false; // Do not clobber vgpr_hi16
3462
3463 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3464 return false;
3465
3466 UseMI.getOperand(0).setSubReg(0);
3467 if (DstReg.isPhysical()) {
3468 DstReg = RI.get32BitRegister(DstReg);
3469 UseMI.getOperand(0).setReg(DstReg);
3470 }
3471 assert(UseMI.getOperand(1).getReg().isVirtual());
3472 }
3473
3474 const MCInstrDesc &NewMCID = get(NewOpc);
3475 if (DstReg.isPhysical() &&
3476 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3477 return false;
3478
3479 UseMI.setDesc(NewMCID);
3480 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3481 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3482 return true;
3483 }
3484
3485 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3486 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3487 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3488 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3489 Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
3490 // Don't fold if we are using source or output modifiers. The new VOP2
3491 // instructions don't have them.
3492 if (hasAnyModifiersSet(UseMI))
3493 return false;
3494
3495 // If this is a free constant, there's no reason to do this.
3496 // TODO: We could fold this here instead of letting SIFoldOperands do it
3497 // later.
3498 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
3499
3500 // Any src operand can be used for the legality check.
3501 if (isInlineConstant(UseMI, *Src0, *ImmOp))
3502 return false;
3503
3504 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3505 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
3506 bool IsFMA =
3507 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3508 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3509 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
3510 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3511 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3512
3513 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3514 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3515 (Src1->isReg() && Src1->getReg() == Reg)) {
3516 MachineOperand *RegSrc =
3517 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3518 if (!RegSrc->isReg())
3519 return false;
3520 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3521 ST.getConstantBusLimit(Opc) < 2)
3522 return false;
3523
3524 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3525 return false;
3526
3527 // If src2 is also a literal constant then we have to choose which one to
3528 // fold. In general it is better to choose madak so that the other literal
3529 // can be materialized in an sgpr instead of a vgpr:
3530 // s_mov_b32 s0, literal
3531 // v_madak_f32 v0, s0, v0, literal
3532 // Instead of:
3533 // v_mov_b32 v1, literal
3534 // v_madmk_f32 v0, v0, literal, v1
3535 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3536 if (Def && Def->isMoveImmediate() &&
3537 !isInlineConstant(Def->getOperand(1)))
3538 return false;
3539
3540 unsigned NewOpc =
3541 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3542 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
3543 : AMDGPU::V_FMAMK_F16)
3544 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3545 if (pseudoToMCOpcode(NewOpc) == -1)
3546 return false;
3547
3548 // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3549 // would also require restricting their register classes. For now
3550 // just bail out.
3551 if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3552 return false;
3553
3554 const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
3555
3556 // FIXME: This would be a lot easier if we could return a new instruction
3557 // instead of having to modify in place.
3558
3559 Register SrcReg = RegSrc->getReg();
3560 unsigned SrcSubReg = RegSrc->getSubReg();
3561 Src0->setReg(SrcReg);
3562 Src0->setSubReg(SrcSubReg);
3563 Src0->setIsKill(RegSrc->isKill());
3564
3565 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3566 Opc == AMDGPU::V_FMAC_F32_e64 ||
3567 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
3568 UseMI.untieRegOperand(
3569 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3570
3571 Src1->ChangeToImmediate(Imm);
3572
3573 removeModOperands(UseMI);
3574 UseMI.setDesc(get(NewOpc));
3575
3576 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3577 if (DeleteDef)
3578 DefMI.eraseFromParent();
3579
3580 return true;
3581 }
3582
3583 // Added part is the constant: Use v_madak_{f16, f32}.
3584 if (Src2->isReg() && Src2->getReg() == Reg) {
3585 if (ST.getConstantBusLimit(Opc) < 2) {
3586 // Not allowed to use constant bus for another operand.
3587 // We can however allow an inline immediate as src0.
3588 bool Src0Inlined = false;
3589 if (Src0->isReg()) {
3590 // Try to inline constant if possible.
3591 // If the Def is a move-immediate and this is its only use,
3592 // folding the constant here saves a VGPR.
3593 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3594 if (Def && Def->isMoveImmediate() &&
3595 isInlineConstant(Def->getOperand(1)) &&
3596 MRI->hasOneUse(Src0->getReg())) {
3597 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3598 Src0Inlined = true;
3599 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3600 RI.isSGPRReg(*MRI, Src0->getReg())) {
3601 return false;
3602 }
3603 // VGPR is okay as Src0 - fallthrough
3604 }
3605
3606 if (Src1->isReg() && !Src0Inlined) {
3607 // We have one slot for inlinable constant so far - try to fill it
3608 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3609 if (Def && Def->isMoveImmediate() &&
3610 isInlineConstant(Def->getOperand(1)) &&
3611 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3612 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3613 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3614 return false;
3615 // VGPR is okay as Src1 - fallthrough
3616 }
3617 }
3618
3619 unsigned NewOpc =
3620 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3621 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
3622 : AMDGPU::V_FMAAK_F16)
3623 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3624 if (pseudoToMCOpcode(NewOpc) == -1)
3625 return false;
3626
3627 // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3628 // would also require restricting their register classes. For now
3629 // just bail out.
3630 if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3631 return false;
3632
3633 // FIXME: This would be a lot easier if we could return a new instruction
3634 // instead of having to modify in place.
3635
3636 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3637 Opc == AMDGPU::V_FMAC_F32_e64 ||
3638 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
3639 UseMI.untieRegOperand(
3640 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3641
3642 // ChangeToImmediate adds Src2 back to the instruction.
3643 Src2->ChangeToImmediate(getImmFor(*Src2));
3644
3645 // These come before src2.
3646 removeModOperands(UseMI);
3647 UseMI.setDesc(get(NewOpc));
3648 // It might happen that UseMI was commuted and we now have an SGPR as
3649 // src1. If so, the combination of two inline constants, or of an inline
3650 // constant and an SGPR, is illegal, so the operands must be legalized.
3651 legalizeOperands(UseMI);
3652
3653 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3654 if (DeleteDef)
3655 DefMI.eraseFromParent();
3656
3657 return true;
3658 }
3659 }
3660
3661 return false;
3662}
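// A rough sketch of the madak/madmk folds performed above, written as
// approximate MIR; the virtual register names and the literal are hypothetical
// and not taken from real pass output. When the folded constant feeds src2,
// the "added constant" form is chosen:
//
//   %k:vgpr_32 = V_MOV_B32_e32 1077936128            ; DefMI, literal 3.0f
//   %d:vgpr_32 = V_MAC_F32_e64 0, %a, 0, %b, 0, %k   ; UseMI, src2 == %k
//   -->
//   %d:vgpr_32 = V_MADAK_F32 %a, %b, 1077936128
//
// When the constant feeds src0 or src1 instead, the "multiplied constant" form
// (V_MADMK_F32 / V_FMAMK_*) is used, with the surviving register source moved
// into src0.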
3663
3664static bool
3665memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3666 ArrayRef<const MachineOperand *> BaseOps2) {
3667 if (BaseOps1.size() != BaseOps2.size())
3668 return false;
3669 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3670 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3671 return false;
3672 }
3673 return true;
3674}
3675
3676static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3677 LocationSize WidthB, int OffsetB) {
3678 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3679 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3680 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3681 return LowWidth.hasValue() &&
3682 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3683}
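// Worked example for offsetsDoNotOverlap: accesses of width 4 at offsets 0 and
// 8 give LowOffset = 0 and LowWidth = 4, and 0 + 4 <= 8, so the ranges are
// disjoint. With widths 16 and 4 at the same offsets, 0 + 16 > 8, so the
// accesses may overlap and the function returns false.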
3684
3685bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3686 const MachineInstr &MIb) const {
3687 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3688 int64_t Offset0, Offset1;
3689 LocationSize Dummy0 = 0, Dummy1 = 0;
3690 bool Offset0IsScalable, Offset1IsScalable;
3691 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3692 Dummy0, &RI) ||
3693 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3694 Dummy1, &RI))
3695 return false;
3696
3697 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3698 return false;
3699
3700 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3701 // FIXME: Handle ds_read2 / ds_write2.
3702 return false;
3703 }
3704 LocationSize Width0 = MIa.memoperands().front()->getSize();
3705 LocationSize Width1 = MIb.memoperands().front()->getSize();
3706 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3707}
3708
3709bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3710 const MachineInstr &MIb) const {
3711 assert(MIa.mayLoadOrStore() &&
3712 "MIa must load from or modify a memory location");
3713 assert(MIb.mayLoadOrStore() &&
3714 "MIb must load from or modify a memory location");
3715
3716 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3717 return false;
3718
3719 // XXX - Can we relax this between address spaces?
3720 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3721 return false;
3722
3723 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3724 return false;
3725
3726 // TODO: Should we check the address space from the MachineMemOperand? That
3727 // would allow us to distinguish objects we know don't alias based on the
3728 // underlying address space, even if it was lowered to a different one,
3729 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3730 // buffer.
3731 if (isDS(MIa)) {
3732 if (isDS(MIb))
3733 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3734
3735 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3736 }
3737
3738 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3739 if (isMUBUF(MIb) || isMTBUF(MIb))
3740 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3741
3742 if (isFLAT(MIb))
3743 return isFLATScratch(MIb);
3744
3745 return !isSMRD(MIb);
3746 }
3747
3748 if (isSMRD(MIa)) {
3749 if (isSMRD(MIb))
3750 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3751
3752 if (isFLAT(MIb))
3753 return isFLATScratch(MIb);
3754
3755 return !isMUBUF(MIb) && !isMTBUF(MIb);
3756 }
3757
3758 if (isFLAT(MIa)) {
3759 if (isFLAT(MIb)) {
3760 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3761 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3762 return true;
3763
3764 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3765 }
3766
3767 return false;
3768 }
3769
3770 return false;
3771}
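// Example of the reasoning above: a DS (LDS) access is trivially disjoint from
// any segment-specific FLAT access (scratch or global), since those never
// touch LDS, while a generic FLAT access could be lowered to any segment and
// therefore has to be treated conservatively.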
3772
3773static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3774 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3775 if (Reg.isPhysical())
3776 return false;
3777 auto *Def = MRI.getUniqueVRegDef(Reg);
3778 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3779 Imm = Def->getOperand(1).getImm();
3780 if (DefMI)
3781 *DefMI = Def;
3782 return true;
3783 }
3784 return false;
3785}
3786
3787static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3788 MachineInstr **DefMI = nullptr) {
3789 if (!MO->isReg())
3790 return false;
3791 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3792 const MachineRegisterInfo &MRI = MF->getRegInfo();
3793 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3794}
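// In other words, a "foldable immediate" here is a virtual register whose
// unique definition is a foldable copy of an immediate (for example an
// S_MOV_B32 or V_MOV_B32_e32 of a constant); the constant is returned through
// Imm and, if requested, the defining instruction through DefMI.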
3795
3796static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3797 MachineInstr &NewMI) {
3798 if (LV) {
3799 unsigned NumOps = MI.getNumOperands();
3800 for (unsigned I = 1; I < NumOps; ++I) {
3801 MachineOperand &Op = MI.getOperand(I);
3802 if (Op.isReg() && Op.isKill())
3803 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3804 }
3805 }
3806}
3807
3808MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3809 LiveVariables *LV,
3810 LiveIntervals *LIS) const {
3811 MachineBasicBlock &MBB = *MI.getParent();
3812 unsigned Opc = MI.getOpcode();
3813
3814 // Handle MFMA.
3815 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3816 if (NewMFMAOpc != -1) {
3817 MachineInstrBuilder MIB =
3818 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3819 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3820 MIB.add(MI.getOperand(I));
3821 updateLiveVariables(LV, MI, *MIB);
3822 if (LIS) {
3823 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3824 // SlotIndex of defs needs to be updated when converting to early-clobber
3825 MachineOperand &Def = MIB->getOperand(0);
3826 if (Def.isEarlyClobber() && Def.isReg() &&
3827 LIS->hasInterval(Def.getReg())) {
3828 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
3829 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
3830 auto &LI = LIS->getInterval(Def.getReg());
3831 auto UpdateDefIndex = [&](LiveRange &LR) {
3832 auto *S = LR.find(OldIndex);
3833 if (S != LR.end() && S->start == OldIndex) {
3834 assert(S->valno && S->valno->def == OldIndex);
3835 S->start = NewIndex;
3836 S->valno->def = NewIndex;
3837 }
3838 };
3839 UpdateDefIndex(LI);
3840 for (auto &SR : LI.subranges())
3841 UpdateDefIndex(SR);
3842 }
3843 }
3844 return MIB;
3845 }
3846
3847 if (SIInstrInfo::isWMMA(MI)) {
3848 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
3849 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3850 .setMIFlags(MI.getFlags());
3851 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3852 MIB->addOperand(MI.getOperand(I));
3853
3854 updateLiveVariables(LV, MI, *MIB);
3855 if (LIS)
3856 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3857
3858 return MIB;
3859 }
3860
3861 assert(
3862 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3863 "V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
3864 "pre-RA");
3865
3866 // Handle MAC/FMAC.
3867 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3868 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3869 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
3870 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3871 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3872 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3873 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3874 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3875 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3876 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3877 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3878 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3879 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3880 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3881 bool Src0Literal = false;
3882
3883 switch (Opc) {
3884 default:
3885 return nullptr;
3886 case AMDGPU::V_MAC_F16_e64:
3887 case AMDGPU::V_FMAC_F16_e64:
3888 case AMDGPU::V_FMAC_F16_fake16_e64:
3889 case AMDGPU::V_MAC_F32_e64:
3890 case AMDGPU::V_MAC_LEGACY_F32_e64:
3891 case AMDGPU::V_FMAC_F32_e64:
3892 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3893 case AMDGPU::V_FMAC_F64_e64:
3894 break;
3895 case AMDGPU::V_MAC_F16_e32:
3896 case AMDGPU::V_FMAC_F16_e32:
3897 case AMDGPU::V_MAC_F32_e32:
3898 case AMDGPU::V_MAC_LEGACY_F32_e32:
3899 case AMDGPU::V_FMAC_F32_e32:
3900 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3901 case AMDGPU::V_FMAC_F64_e32: {
3902 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3903 AMDGPU::OpName::src0);
3904 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3905 if (!Src0->isReg() && !Src0->isImm())
3906 return nullptr;
3907
3908 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3909 Src0Literal = true;
3910
3911 break;
3912 }
3913 }
3914
3915 MachineInstrBuilder MIB;
3916 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3917 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3918 const MachineOperand *Src0Mods =
3919 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3920 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3921 const MachineOperand *Src1Mods =
3922 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3923 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3924 const MachineOperand *Src2Mods =
3925 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
3926 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3927 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3928 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
3929
3930 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
3931 !IsLegacy &&
3932 // If we have an SGPR input, we will violate the constant bus restriction.
3933 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3934 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3935 MachineInstr *DefMI;
3936 const auto killDef = [&]() -> void {
3937 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3938 // The only user is the instruction which will be killed.
3939 Register DefReg = DefMI->getOperand(0).getReg();
3940
3941 if (MRI.hasOneNonDBGUse(DefReg)) {
3942 // We cannot just remove DefMI here; the calling pass would crash.
3943 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
3944 DefMI->getOperand(0).setIsDead(true);
3945 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
3946 DefMI->removeOperand(I);
3947 if (LV)
3948 LV->getVarInfo(DefReg).AliveBlocks.clear();
3949 }
3950
3951 if (LIS) {
3952 LiveInterval &DefLI = LIS->getInterval(DefReg);
3953
3954 // We cannot delete the original instruction here, so hack out the use
3955 // in the original instruction with a dummy register so we can use
3956 // shrinkToUses to deal with any multi-use edge cases. Other targets do
3957 // not have the complexity of deleting a use to consider here.
3958 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
3959 for (MachineOperand &MIOp : MI.uses()) {
3960 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
3961 MIOp.setIsUndef(true);
3962 MIOp.setReg(DummyReg);
3963 }
3964 }
3965
3966 LIS->shrinkToUses(&DefLI);
3967 }
3968 };
3969
3970 int64_t Imm;
3971 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
3972 unsigned NewOpc =
3973 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
3974 : AMDGPU::V_FMAAK_F16)
3975 : AMDGPU::V_FMAAK_F32)
3976 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
3977 if (pseudoToMCOpcode(NewOpc) != -1) {
3978 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3979 .add(*Dst)
3980 .add(*Src0)
3981 .add(*Src1)
3982 .addImm(Imm)
3983 .setMIFlags(MI.getFlags());
3984 updateLiveVariables(LV, MI, *MIB);
3985 if (LIS)
3986 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3987 killDef();
3988 return MIB;
3989 }
3990 }
3991 unsigned NewOpc =
3992 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
3993 : AMDGPU::V_FMAMK_F16)
3994 : AMDGPU::V_FMAMK_F32)
3995 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
3996 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
3997 if (pseudoToMCOpcode(NewOpc) != -1) {
3998 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3999 .add(*Dst)
4000 .add(*Src0)
4001 .addImm(Imm)
4002 .add(*Src2)
4003 .setMIFlags(MI.getFlags());
4004 updateLiveVariables(LV, MI, *MIB);
4005
4006 if (LIS)
4007 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4008 killDef();
4009 return MIB;
4010 }
4011 }
4012 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4013 if (Src0Literal) {
4014 Imm = Src0->getImm();
4015 DefMI = nullptr;
4016 }
4017 if (pseudoToMCOpcode(NewOpc) != -1 &&
4018 isOperandLegal(
4019 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4020 Src1)) {
4021 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4022 .add(*Dst)
4023 .add(*Src1)
4024 .addImm(Imm)
4025 .add(*Src2)
4026 .setMIFlags(MI.getFlags());
4027 updateLiveVariables(LV, MI, *MIB);
4028
4029 if (LIS)
4030 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4031 if (DefMI)
4032 killDef();
4033 return MIB;
4034 }
4035 }
4036 }
4037
4038 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4039 // if VOP3 does not allow a literal operand.
4040 if (Src0Literal && !ST.hasVOP3Literal())
4041 return nullptr;
4042
4043 unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
4044 : IsF64 ? AMDGPU::V_FMA_F64_e64
4045 : IsLegacy
4046 ? AMDGPU::V_FMA_LEGACY_F32_e64
4047 : AMDGPU::V_FMA_F32_e64
4048 : IsF16 ? AMDGPU::V_MAD_F16_e64
4049 : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
4050 : AMDGPU::V_MAD_F32_e64;
4051 if (pseudoToMCOpcode(NewOpc) == -1)
4052 return nullptr;
4053
4054 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4055 .add(*Dst)
4056 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4057 .add(*Src0)
4058 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4059 .add(*Src1)
4060 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4061 .add(*Src2)
4062 .addImm(Clamp ? Clamp->getImm() : 0)
4063 .addImm(Omod ? Omod->getImm() : 0)
4064 .setMIFlags(MI.getFlags());
4065 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4066 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4067 updateLiveVariables(LV, MI, *MIB);
4068 if (LIS)
4069 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4070 return MIB;
4071}
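// A condensed sketch of the two-address to three-address conversion above;
// operand lists are abbreviated and the register names are hypothetical:
//
//   %d:vgpr_32 = V_MAC_F32_e32 %a, %b, %c   ; %c tied to %d
//   -->
//   %d:vgpr_32 = V_MAD_F32_e64 0, %a, 0, %b, 0, %c, 0, 0
//
// which drops the tied-operand constraint at the cost of the VOP3 encoding.
// When one of the sources comes from a foldable move-immediate, the
// V_MADAK/V_MADMK (or V_FMAAK/V_FMAMK) forms are preferred instead, as handled
// earlier in this function.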
4072
4073// It's not generally safe to move VALU instructions across these since it will
4074// start using the register as a base index rather than directly.
4075// XXX - Why isn't hasSideEffects sufficient for these?
4076static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4077 switch (MI.getOpcode()) {
4078 case AMDGPU::S_SET_GPR_IDX_ON:
4079 case AMDGPU::S_SET_GPR_IDX_MODE:
4080 case AMDGPU::S_SET_GPR_IDX_OFF:
4081 return true;
4082 default:
4083 return false;
4084 }
4085}
4086
4087bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4088 const MachineBasicBlock *MBB,
4089 const MachineFunction &MF) const {
4090 // Skipping the check for SP writes in the base implementation. That check
4091 // was apparently added due to compile-time concerns.
4092 //
4093 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4094 // but is probably avoidable.
4095
4096 // Copied from base implementation.
4097 // Terminators and labels can't be scheduled around.
4098 if (MI.isTerminator() || MI.isPosition())
4099 return true;
4100
4101 // INLINEASM_BR can jump to another block
4102 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4103 return true;
4104
4105 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4106 return true;
4107
4108 // Target-independent instructions do not have an implicit-use of EXEC, even
4109 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4110 // boundaries prevents incorrect movements of such instructions.
4111 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4112 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4113 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4114 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4115 changesVGPRIndexingMode(MI);
4116}
4117
4118bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4119 return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
4120}
4121
4122bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4123 // Skip the full operand and register alias search modifiesRegister
4124 // does. There's only a handful of instructions that touch this, it's only an
4125 // implicit def, and doesn't alias any other registers.
4126 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4127}
4128
4129bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4130 unsigned Opcode = MI.getOpcode();
4131
4132 if (MI.mayStore() && isSMRD(MI))
4133 return true; // scalar store or atomic
4134
4135 // This will terminate the function when other lanes may need to continue.
4136 if (MI.isReturn())
4137 return true;
4138
4139 // These instructions cause shader I/O that may cause hardware lockups
4140 // when executed with an empty EXEC mask.
4141 //
4142 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4143 // EXEC = 0, but checking for that case here seems not worth it
4144 // given the typical code patterns.
4145 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4146 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4147 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4148 return true;
4149
4150 if (MI.isCall() || MI.isInlineAsm())
4151 return true; // conservative assumption
4152
4153 // Assume that barrier interactions are only intended with active lanes.
4154 if (isBarrier(Opcode))
4155 return true;
4156
4157 // A mode change is a scalar operation that influences vector instructions.
4158 if (modifiesModeRegister(MI))
4159 return true;
4160
4161 // These are like SALU instructions in terms of effects, so it's questionable
4162 // whether we should return true for those.
4163 //
4164 // However, executing them with EXEC = 0 causes them to operate on undefined
4165 // data, which we avoid by returning true here.
4166 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4167 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4168 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4169 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4170 return true;
4171
4172 return false;
4173}
4174
4175bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4176 const MachineInstr &MI) const {
4177 if (MI.isMetaInstruction())
4178 return false;
4179
4180 // This won't read exec if this is an SGPR->SGPR copy.
4181 if (MI.isCopyLike()) {
4182 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4183 return true;
4184
4185 // Make sure this isn't copying exec as a normal operand
4186 return MI.readsRegister(AMDGPU::EXEC, &RI);
4187 }
4188
4189 // Make a conservative assumption about the callee.
4190 if (MI.isCall())
4191 return true;
4192
4193 // Be conservative with any unhandled generic opcodes.
4194 if (!isTargetSpecificOpcode(MI.getOpcode()))
4195 return true;
4196
4197 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4198}
4199
4200bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4201 switch (Imm.getBitWidth()) {
4202 case 1: // This likely will be a condition code mask.
4203 return true;
4204
4205 case 32:
4206 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4207 ST.hasInv2PiInlineImm());
4208 case 64:
4209 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4210 ST.hasInv2PiInlineImm());
4211 case 16:
4212 return ST.has16BitInsts() &&
4213 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4214 ST.hasInv2PiInlineImm());
4215 default:
4216 llvm_unreachable("invalid bitwidth");
4217 }
4218}
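// Concrete 32-bit examples: the integers -16..64 and the bit patterns of
// +/-0.5, +/-1.0, +/-2.0, +/-4.0 (plus 1/(2*pi) when the subtarget supports
// it) are inline constants, so an immediate of 64 is free to encode while 65
// has to be emitted as a 32-bit literal.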
4219
4220bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4221 APInt IntImm = Imm.bitcastToAPInt();
4222 int64_t IntImmVal = IntImm.getSExtValue();
4223 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4224 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4225 default:
4226 llvm_unreachable("invalid fltSemantics");
4227 case APFloatBase::S_IEEEsingle:
4228 case APFloatBase::S_IEEEdouble:
4229 return isInlineConstant(IntImm);
4230 case APFloatBase::S_BFloat:
4231 return ST.has16BitInsts() &&
4232 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4233 case APFloatBase::S_IEEEhalf:
4234 return ST.has16BitInsts() &&
4235 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4236 }
4237}
4238
4239bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
4240 uint8_t OperandType) const {
4241 assert(!MO.isReg() && "isInlineConstant called on register operand!");
4242 if (!MO.isImm())
4243 return false;
4244
4245 // MachineOperand provides no way to tell the true operand size, since it only
4246 // records a 64-bit value. We need to know the size to determine if a 32-bit
4247 // floating point immediate bit pattern is legal for an integer immediate. It
4248 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4249
4250 int64_t Imm = MO.getImm();
4251 switch (OperandType) {
4264 int32_t Trunc = static_cast<int32_t>(Imm);
4265 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4266 }
4272 return AMDGPU::isInlinableLiteral64(MO.getImm(),
4273 ST.hasInv2PiInlineImm());
4277 // We would expect inline immediates to not be concerned with an integer/fp
4278 // distinction. However, in the case of 16-bit integer operations, the
4279 // "floating point" values appear to not work. It seems read the low 16-bits
4280 // of 32-bit immediates, which happens to always work for the integer
4281 // values.
4282 //
4283 // See llvm bugzilla 46302.
4284 //
4285 // TODO: Theoretically we could use op-sel to use the high bits of the
4286 // 32-bit FP values.
4304 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4305 // A few special case instructions have 16-bit operands on subtargets
4306 // where 16-bit instructions are not legal.
4307 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4308 // constants in these cases
4309 int16_t Trunc = static_cast<int16_t>(Imm);
4310 return ST.has16BitInsts() &&
4312 }
4313
4314 return false;
4315 }
4320 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4321 int16_t Trunc = static_cast<int16_t>(Imm);
4322 return ST.has16BitInsts() &&
4324 }
4325 return false;
4326 }
4329 return false;
4332 // Always embedded in the instruction for free.
4333 return true;
4343 // Just ignore anything else.
4344 return true;
4345 default:
4346 llvm_unreachable("invalid operand type");
4347 }
4348}
4349
4350static bool compareMachineOp(const MachineOperand &Op0,
4351 const MachineOperand &Op1) {
4352 if (Op0.getType() != Op1.getType())
4353 return false;
4354
4355 switch (Op0.getType()) {
4356 case MachineOperand::MO_Register:
4357 return Op0.getReg() == Op1.getReg();
4358 case MachineOperand::MO_Immediate:
4359 return Op0.getImm() == Op1.getImm();
4360 default:
4361 llvm_unreachable("Didn't expect to be comparing these operand types");
4362 }
4363}
4364
4365bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4366 const MachineOperand &MO) const {
4367 const MCInstrDesc &InstDesc = MI.getDesc();
4368 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4369
4370 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4371
4372 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4373 return true;
4374
4375 if (OpInfo.RegClass < 0)
4376 return false;
4377
4378 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4379 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4380 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4381 AMDGPU::OpName::src2))
4382 return false;
4383 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4384 }
4385
4386 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4387 return false;
4388
4389 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4390 return true;
4391
4392 return ST.hasVOP3Literal();
4393}
4394
4395bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4396 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4397 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4398 return false;
4399
4400 int Op32 = AMDGPU::getVOPe32(Opcode);
4401 if (Op32 == -1)
4402 return false;
4403
4404 return pseudoToMCOpcode(Op32) != -1;
4405}
4406
4407bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4408 // The src0_modifier operand is present on all instructions
4409 // that have modifiers.
4410
4411 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4412}
4413
4414bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4415 unsigned OpName) const {
4416 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4417 return Mods && Mods->getImm();
4418}
4419
4420bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4421 return any_of(ModifierOpNames,
4422 [&](unsigned Name) { return hasModifiersSet(MI, Name); });
4423}
4424
4425bool SIInstrInfo::canShrink(const MachineInstr &MI,
4426 const MachineRegisterInfo &MRI) const {
4427 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4428 // Can't shrink instruction with three operands.
4429 if (Src2) {
4430 switch (MI.getOpcode()) {
4431 default: return false;
4432
4433 case AMDGPU::V_ADDC_U32_e64:
4434 case AMDGPU::V_SUBB_U32_e64:
4435 case AMDGPU::V_SUBBREV_U32_e64: {
4436 const MachineOperand *Src1
4437 = getNamedOperand(MI, AMDGPU::OpName::src1);
4438 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4439 return false;
4440 // Additional verification is needed for sdst/src2.
4441 return true;
4442 }
4443 case AMDGPU::V_MAC_F16_e64:
4444 case AMDGPU::V_MAC_F32_e64:
4445 case AMDGPU::V_MAC_LEGACY_F32_e64:
4446 case AMDGPU::V_FMAC_F16_e64:
4447 case AMDGPU::V_FMAC_F16_fake16_e64:
4448 case AMDGPU::V_FMAC_F32_e64:
4449 case AMDGPU::V_FMAC_F64_e64:
4450 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4451 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4452 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4453 return false;
4454 break;
4455
4456 case AMDGPU::V_CNDMASK_B32_e64:
4457 break;
4458 }
4459 }
4460
4461 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4462 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4463 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4464 return false;
4465
4466 // We don't need to check src0, all input types are legal, so just make sure
4467 // src0 isn't using any modifiers.
4468 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4469 return false;
4470
4471 // Can it be shrunk to a valid 32 bit opcode?
4472 if (!hasVALU32BitEncoding(MI.getOpcode()))
4473 return false;
4474
4475 // Check output modifiers
4476 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4477 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4478 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4479 // TODO: Can we avoid checking bound_ctrl/fi here?
4480 // They are only used by permlane*_swap special case.
4481 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4482 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4483}
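// Example: a V_ADD_F32_e64 whose src1 is a VGPR and which has no source
// modifiers, clamp or omod can be shrunk to the VOP2 V_ADD_F32_e32 encoding;
// the same instruction with clamp/omod set, or with an SGPR in src1, has to
// stay in the VOP3 form.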
4484
4485// Set VCC operand with all flags from \p Orig, except for setting it as
4486// implicit.
4487static void copyFlagsToImplicitVCC(MachineInstr &MI,
4488 const MachineOperand &Orig) {
4489
4490 for (MachineOperand &Use : MI.implicit_operands()) {
4491 if (Use.isUse() &&
4492 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4493 Use.setIsUndef(Orig.isUndef());
4494 Use.setIsKill(Orig.isKill());
4495 return;
4496 }
4497 }
4498}
4499
4500MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4501 unsigned Op32) const {
4502 MachineBasicBlock *MBB = MI.getParent();
4503
4504 const MCInstrDesc &Op32Desc = get(Op32);
4505 MachineInstrBuilder Inst32 =
4506 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4507 .setMIFlags(MI.getFlags());
4508
4509 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4510 // For VOPC instructions, this is replaced by an implicit def of vcc.
4511
4512 // We assume the defs of the shrunk opcode are in the same order, and the
4513 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4514 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4515 Inst32.add(MI.getOperand(I));
4516
4517 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4518
4519 int Idx = MI.getNumExplicitDefs();
4520 for (const MachineOperand &Use : MI.explicit_uses()) {
4521 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4522 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4523 continue;
4524
4525 if (&Use == Src2) {
4526 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4527 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4528 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4529 // of vcc was already added during the initial BuildMI, but we
4530 // 1) may need to change vcc to vcc_lo to preserve the original register
4531 // 2) have to preserve the original flags.
4532 copyFlagsToImplicitVCC(*Inst32, *Src2);
4533 continue;
4534 }
4535 }
4536
4537 Inst32.add(Use);
4538 }
4539
4540 // FIXME: Losing implicit operands
4541 fixImplicitOperands(*Inst32);
4542 return Inst32;
4543}
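// For example, shrinking V_CNDMASK_B32_e64 to V_CNDMASK_B32_e32 drops the
// explicit src2 carry-in operand; it is replaced by the implicit vcc (or
// vcc_lo) read added during the initial BuildMI, with the original undef/kill
// flags carried over by copyFlagsToImplicitVCC.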
4544
4545bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4546 const MachineOperand &MO,
4547 const MCOperandInfo &OpInfo) const {
4548 // Literal constants use the constant bus.
4549 if (!MO.isReg())
4550 return !isInlineConstant(MO, OpInfo);
4551
4552 if (!MO.isUse())
4553 return false;
4554
4555 if (MO.getReg().isVirtual())
4556 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4557
4558 // Null is free
4559 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4560 return false;
4561
4562 // SGPRs use the constant bus
4563 if (MO.isImplicit()) {
4564 return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
4565 MO.getReg() == AMDGPU::VCC_LO;
4566 }
4567 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4568 AMDGPU::SReg_64RegClass.contains(MO.getReg());
4569}
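// Example: on a subtarget with a constant bus limit of 1, a VOP2 instruction
// may use at most one SGPR or non-inline literal among its sources; VGPRs,
// null and inline constants do not count against the limit.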
4570
4571static Register findImplicitSGPRRead(const MachineInstr &MI) {
4572 for (const MachineOperand &MO : MI.implicit_operands()) {
4573 // We only care about reads.
4574 if (MO.isDef())
4575 continue;
4576
4577 switch (MO.getReg()) {
4578 case AMDGPU::VCC:
4579 case AMDGPU::VCC_LO:
4580 case AMDGPU::VCC_HI:
4581 case AMDGPU::M0:
4582 case AMDGPU::FLAT_SCR:
4583 return MO.getReg();
4584
4585 default:
4586 break;
4587 }
4588 }
4589
4590 return Register();
4591}
4592
4593static bool shouldReadExec(const MachineInstr &MI) {
4594 if (SIInstrInfo::isVALU(MI)) {
4595 switch (MI.getOpcode()) {
4596 case AMDGPU::V_READLANE_B32:
4597 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4598 case AMDGPU::V_WRITELANE_B32:
4599 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4600 return false;
4601 }
4602
4603 return true;
4604 }
4605
4606 if (MI.isPreISelOpcode() ||
4607 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4608 SIInstrInfo::isSALU(MI) ||
4609 SIInstrInfo::isSMRD(MI))
4610 return false;
4611
4612 return true;
4613}
4614
4615static bool isRegOrFI(const MachineOperand &MO) {
4616 return MO.isReg() || MO.isFI();
4617}
4618
4619static bool isSubRegOf(const SIRegisterInfo &TRI,
4620 const MachineOperand &SuperVec,
4621 const MachineOperand &SubReg) {
4622 if (SubReg.getReg().isPhysical())
4623 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4624
4625 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4626 SubReg.getReg() == SuperVec.getReg();
4627}
4628
4629// Verify the illegal copy from vector register to SGPR for generic opcode COPY
4630bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4631 const MachineRegisterInfo &MRI,
4632 StringRef &ErrInfo) const {
4633 Register DstReg = MI.getOperand(0).getReg();
4634 Register SrcReg = MI.getOperand(1).getReg();
4635 // This is a check for copy from vector register to SGPR
4636 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4637 ErrInfo = "illegal copy from vector register to SGPR";
4638 return false;
4639 }
4640 return true;
4641}
4642
4643bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4644 StringRef &ErrInfo) const {
4645 uint16_t Opcode = MI.getOpcode();
4646 const MachineFunction *MF = MI.getParent()->getParent();
4647 const MachineRegisterInfo &MRI = MF->getRegInfo();
4648
4649 // FIXME: At this point the COPY verify is done only for non-ssa forms.
4650 // Find a better property to recognize the point where instruction selection
4651 // is just done.
4652 // We can only enforce this check after SIFixSGPRCopies pass so that the
4653 // illegal copies are legalized and thereafter we don't expect a pass
4654 // inserting similar copies.
4655 if (!MRI.isSSA() && MI.isCopy())
4656 return verifyCopy(MI, MRI, ErrInfo);
4657
4658 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
4659 return true;
4660
4661 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4662 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4663 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4664 int Src3Idx = -1;
4665 if (Src0Idx == -1) {
4666 // VOPD V_DUAL_* instructions use different operand names.
4667 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4668 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4669 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4670 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4671 }
4672
4673 // Make sure the number of operands is correct.
4674 const MCInstrDesc &Desc = get(Opcode);
4675 if (!Desc.isVariadic() &&
4676 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4677 ErrInfo = "Instruction has wrong number of operands.";
4678 return false;
4679 }
4680
4681 if (MI.isInlineAsm()) {
4682 // Verify register classes for inlineasm constraints.
4683 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4684 I != E; ++I) {
4685 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4686 if (!RC)
4687 continue;
4688
4689 const MachineOperand &Op = MI.getOperand(I);
4690 if (!Op.isReg())
4691 continue;
4692
4693 Register Reg = Op.getReg();
4694 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4695 ErrInfo = "inlineasm operand has incorrect register class.";
4696 return false;
4697 }
4698 }
4699
4700 return true;
4701 }
4702
4703 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4704 ErrInfo = "missing memory operand from image instruction.";
4705 return false;
4706 }
4707
4708 // Make sure the register classes are correct.
4709 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4710 const MachineOperand &MO = MI.getOperand(i);
4711 if (MO.isFPImm()) {
4712 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4713 "all fp values to integers.";
4714 return false;
4715 }
4716
4717 int RegClass = Desc.operands()[i].RegClass;
4718
4719 switch (Desc.operands()[i].OperandType) {
4720 case MCOI::OPERAND_REGISTER:
4721 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4722 ErrInfo = "Illegal immediate value for operand.";
4723 return false;
4724 }
4725 break;
4730 break;
4742 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4743 ErrInfo = "Illegal immediate value for operand.";
4744 return false;
4745 }
4746 break;
4747 }
4749 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4750 ErrInfo = "Expected inline constant for operand.";
4751 return false;
4752 }
4753 break;
4756 // Check if this operand is an immediate.
4757 // FrameIndex operands will be replaced by immediates, so they are
4758 // allowed.
4759 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4760 ErrInfo = "Expected immediate, but got non-immediate";
4761 return false;
4762 }
4763 [[fallthrough]];
4764 default:
4765 continue;
4766 }
4767
4768 if (!MO.isReg())
4769 continue;
4770 Register Reg = MO.getReg();
4771 if (!Reg)
4772 continue;
4773
4774 // FIXME: Ideally we would have separate instruction definitions with the
4775 // aligned register constraint.
4776 // FIXME: We do not verify inline asm operands, but custom inline asm
4777 // verification is broken anyway
4778 if (ST.needsAlignedVGPRs()) {
4779 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4780 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4781 if (const TargetRegisterClass *SubRC =
4782 RI.getSubRegisterClass(RC, MO.getSubReg())) {
4783 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4784 if (RC)
4785 RC = SubRC;
4786 }
4787 }
4788
4789 // Check that this is the aligned version of the class.
4790 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4791 ErrInfo = "Subtarget requires even aligned vector registers";
4792 return false;
4793 }
4794 }
4795
4796 if (RegClass != -1) {
4797 if (Reg.isVirtual())
4798 continue;
4799
4800 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4801 if (!RC->contains(Reg)) {
4802 ErrInfo = "Operand has incorrect register class.";
4803 return false;
4804 }
4805 }
4806 }
4807
4808 // Verify SDWA
4809 if (isSDWA(MI)) {
4810 if (!ST.hasSDWA()) {
4811 ErrInfo = "SDWA is not supported on this target";
4812 return false;
4813 }
4814
4815 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4816
4817 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4818 if (OpIdx == -1)
4819 continue;
4820 const MachineOperand &MO = MI.getOperand(OpIdx);
4821
4822 if (!ST.hasSDWAScalar()) {
4823 // Only VGPRS on VI
4824 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4825 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4826 return false;
4827 }
4828 } else {
4829 // No immediates on GFX9
4830 if (!MO.isReg()) {
4831 ErrInfo =
4832 "Only reg allowed as operands in SDWA instructions on GFX9+";
4833 return false;
4834 }
4835 }
4836 }
4837
4838 if (!ST.hasSDWAOmod()) {
4839 // No omod allowed on VI
4840 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4841 if (OMod != nullptr &&
4842 (!OMod->isImm() || OMod->getImm() != 0)) {
4843 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4844 return false;
4845 }
4846 }
4847
4848 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
4849 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
4850 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
4851 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
4852 const MachineOperand *Src0ModsMO =
4853 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4854 unsigned Mods = Src0ModsMO->getImm();
4855 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
4856 Mods & SISrcMods::SEXT) {
4857 ErrInfo = "sext, abs and neg are not allowed on this instruction";
4858 return false;
4859 }
4860 }
4861
4862 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4863 if (isVOPC(BasicOpcode)) {
4864 if (!ST.hasSDWASdst() && DstIdx != -1) {
4865 // Only vcc allowed as dst on VI for VOPC
4866 const MachineOperand &Dst = MI.getOperand(DstIdx);
4867 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4868 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4869 return false;
4870 }
4871 } else if (!ST.hasSDWAOutModsVOPC()) {
4872 // No clamp allowed on GFX9 for VOPC
4873 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4874 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4875 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
4876 return false;
4877 }
4878
4879 // No omod allowed on GFX9 for VOPC
4880 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4881 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4882 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
4883 return false;
4884 }
4885 }
4886 }
4887
4888 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
4889 if (DstUnused && DstUnused->isImm() &&
4890 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4891 const MachineOperand &Dst = MI.getOperand(DstIdx);
4892 if (!Dst.isReg() || !Dst.isTied()) {
4893 ErrInfo = "Dst register should have tied register";
4894 return false;
4895 }
4896
4897 const MachineOperand &TiedMO =
4898 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
4899 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
4900 ErrInfo =
4901 "Dst register should be tied to implicit use of preserved register";
4902 return false;
4903 }
4904 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
4905 ErrInfo = "Dst register should use same physical register as preserved";
4906 return false;
4907 }
4908 }
4909 }
4910
4911 // Verify MIMG / VIMAGE / VSAMPLE
4912 if (isImage(MI.getOpcode()) && !MI.mayStore()) {
4913 // Ensure that the return type used is large enough for all the options
4914 // being used TFE/LWE require an extra result register.
4915 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4916 if (DMask) {
4917 uint64_t DMaskImm = DMask->getImm();
4918 uint32_t RegCount =
4919 isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm);
4920 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
4921 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
4922 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
4923
4924 // Adjust for packed 16 bit values
4925 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
4926 RegCount = divideCeil(RegCount, 2);
4927
4928 // Adjust if using LWE or TFE
4929 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
4930 RegCount += 1;
4931
4932 const uint32_t DstIdx =
4933 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
4934 const MachineOperand &Dst = MI.getOperand(DstIdx);
4935 if (Dst.isReg()) {
4936 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
4937 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
4938 if (RegCount > DstSize) {
4939 ErrInfo = "Image instruction returns too many registers for dst "
4940 "register class";
4941 return false;
4942 }
4943 }
4944 }
4945 }
4946
4947 // Verify VOP*. Ignore multiple sgpr operands on writelane.
4948 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
4949 unsigned ConstantBusCount = 0;
4950 bool UsesLiteral = false;
4951 const MachineOperand *LiteralVal = nullptr;
4952
4953 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
4954 if (ImmIdx != -1) {
4955 ++ConstantBusCount;
4956 UsesLiteral = true;
4957 LiteralVal = &MI.getOperand(ImmIdx);
4958 }
4959
4960 SmallVector<Register, 2> SGPRsUsed;
4961 Register SGPRUsed;
4962
4963 // Only look at the true operands. Only a real operand can use the constant
4964 // bus, and we don't want to check pseudo-operands like the source modifier
4965 // flags.
4966 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
4967 if (OpIdx == -1)
4968 continue;
4969 const MachineOperand &MO = MI.getOperand(OpIdx);
4970 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4971 if (MO.isReg()) {
4972 SGPRUsed = MO.getReg();
4973 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
4974 ++ConstantBusCount;
4975 SGPRsUsed.push_back(SGPRUsed);
4976 }
4977 } else if (!MO.isFI()) { // Treat FI like a register.
4978 if (!UsesLiteral) {
4979 ++ConstantBusCount;
4980 UsesLiteral = true;
4981 LiteralVal = &MO;
4982 } else if (!MO.isIdenticalTo(*LiteralVal)) {
4983 assert(isVOP2(MI) || isVOP3(MI));
4984 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
4985 return false;
4986 }
4987 }
4988 }
4989 }
4990
4991 SGPRUsed = findImplicitSGPRRead(MI);
4992 if (SGPRUsed) {
4993 // Implicit uses may safely overlap true operands
4994 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
4995 return !RI.regsOverlap(SGPRUsed, SGPR);
4996 })) {
4997 ++ConstantBusCount;
4998 SGPRsUsed.push_back(SGPRUsed);
4999 }
5000 }
5001
5002 // v_writelane_b32 is an exception to the constant bus restriction: vsrc0 can
5003 // be an SGPR, a constant or m0, and the lane select an SGPR, m0 or an inline constant.
5004 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5005 Opcode != AMDGPU::V_WRITELANE_B32) {
5006 ErrInfo = "VOP* instruction violates constant bus restriction";
5007 return false;
5008 }
5009
5010 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5011 ErrInfo = "VOP3 instruction uses literal";
5012 return false;
5013 }
5014 }
5015
5016 // Special case for writelane - this can break the multiple constant bus rule,
5017 // but still can't use more than one SGPR register
5018 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5019 unsigned SGPRCount = 0;
5020 Register SGPRUsed;
5021
5022 for (int OpIdx : {Src0Idx, Src1Idx}) {
5023 if (OpIdx == -1)
5024 break;
5025
5026 const MachineOperand &MO = MI.getOperand(OpIdx);
5027
5028 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5029 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5030 if (MO.getReg() != SGPRUsed)
5031 ++SGPRCount;
5032 SGPRUsed = MO.getReg();
5033 }
5034 }
5035 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5036 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5037 return false;
5038 }
5039 }
5040 }
5041
5042 // Verify misc. restrictions on specific instructions.
5043 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5044 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5045 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5046 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5047 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5048 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5049 if (!compareMachineOp(Src0, Src1) &&
5050 !compareMachineOp(Src0, Src2)) {
5051 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5052 return false;
5053 }
5054 }
5055 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5056 SISrcMods::ABS) ||
5057 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5058 SISrcMods::ABS) ||
5059 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5060 SISrcMods::ABS)) {
5061 ErrInfo = "ABS not allowed in VOP3B instructions";
5062 return false;
5063 }
5064 }
5065
5066 if (isSOP2(MI) || isSOPC(MI)) {
5067 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5068 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5069
5070 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5071 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5072 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5073 !Src0.isIdenticalTo(Src1)) {
5074 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5075 return false;
5076 }
5077 }
5078
5079 if (isSOPK(MI)) {
5080 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5081 if (Desc.isBranch()) {
5082 if (!Op->isMBB()) {
5083 ErrInfo = "invalid branch target for SOPK instruction";
5084 return false;
5085 }
5086 } else {
5087 uint64_t Imm = Op->getImm();
5088 if (sopkIsZext(Opcode)) {
5089 if (!isUInt<16>(Imm)) {
5090 ErrInfo = "invalid immediate for SOPK instruction";
5091 return false;
5092 }
5093 } else {
5094 if (!isInt<16>(Imm)) {
5095 ErrInfo = "invalid immediate for SOPK instruction";
5096 return false;
5097 }
5098 }
5099 }
5100 }
5101
5102 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5103 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5104 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5105 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5106 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5107 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5108
5109 const unsigned StaticNumOps =
5110 Desc.getNumOperands() + Desc.implicit_uses().size();
5111 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5112
5113 // Allow additional implicit operands. This allows a fixup done by the post
5114 // RA scheduler where the main implicit operand is killed and implicit-defs
5115 // are added for sub-registers that remain live after this instruction.
5116 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5117 ErrInfo = "missing implicit register operands";
5118 return false;
5119 }
5120
5121 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5122 if (IsDst) {
5123 if (!Dst->isUse()) {
5124 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5125 return false;
5126 }
5127
5128 unsigned UseOpIdx;
5129 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5130 UseOpIdx != StaticNumOps + 1) {
5131 ErrInfo = "movrel implicit operands should be tied";
5132 return false;
5133 }
5134 }
5135
5136 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5137 const MachineOperand &ImpUse
5138 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5139 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5140 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5141 ErrInfo = "src0 should be subreg of implicit vector use";
5142 return false;
5143 }
5144 }
5145
5146 // Make sure we aren't losing exec uses in the td files. This mostly requires
5147 // being careful when using let Uses to try to add other use registers.
5148 if (shouldReadExec(MI)) {
5149 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5150 ErrInfo = "VALU instruction does not implicitly read exec mask";
5151 return false;
5152 }
5153 }
5154
5155 if (isSMRD(MI)) {
5156 if (MI.mayStore() &&
5157 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5158 // The register offset form of scalar stores may only use m0 as the
5159 // soffset register.
5160 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5161 if (Soff && Soff->getReg() != AMDGPU::M0) {
5162 ErrInfo = "scalar stores must use m0 as offset register";
5163 return false;
5164 }
5165 }
5166 }
5167
5168 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5169 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5170 if (Offset->getImm() != 0) {
5171 ErrInfo = "subtarget does not support offsets in flat instructions";
5172 return false;
5173 }
5174 }
5175
5176 if (isDS(MI) && !ST.hasGDS()) {
5177 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5178 if (GDSOp && GDSOp->getImm() != 0) {
5179 ErrInfo = "GDS is not supported on this subtarget";
5180 return false;
5181 }
5182 }
5183
5184 if (isImage(MI)) {
5185 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5186 if (DimOp) {
5187 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5188 AMDGPU::OpName::vaddr0);
5189 int RSrcOpName =
5190 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5191 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5192 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5193 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5194 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5195 const AMDGPU::MIMGDimInfo *Dim =
5196 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5197
5198 if (!Dim) {
5199 ErrInfo = "dim is out of range";
5200 return false;
5201 }
5202
5203 bool IsA16 = false;
5204 if (ST.hasR128A16()) {
5205 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5206 IsA16 = R128A16->getImm() != 0;
5207 } else if (ST.hasA16()) {
5208 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5209 IsA16 = A16->getImm() != 0;
5210 }
5211
5212 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5213
5214 unsigned AddrWords =
5215 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5216
5217 unsigned VAddrWords;
5218 if (IsNSA) {
5219 VAddrWords = RsrcIdx - VAddr0Idx;
5220 if (ST.hasPartialNSAEncoding() &&
5221 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5222 unsigned LastVAddrIdx = RsrcIdx - 1;
5223 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5224 }
5225 } else {
5226 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5227 if (AddrWords > 12)
5228 AddrWords = 16;
5229 }
5230
5231 if (VAddrWords != AddrWords) {
5232 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5233 << " but got " << VAddrWords << "\n");
5234 ErrInfo = "bad vaddr size";
5235 return false;
5236 }
5237 }
5238 }
5239
5240 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5241 if (DppCt) {
5242 using namespace AMDGPU::DPP;
5243
5244 unsigned DC = DppCt->getImm();
5245 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5246 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5247 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5248 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5249 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5250 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5251 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5252 ErrInfo = "Invalid dpp_ctrl value";
5253 return false;
5254 }
5255 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5257 ErrInfo = "Invalid dpp_ctrl value: "
5258 "wavefront shifts are not supported on GFX10+";
5259 return false;
5260 }
5261 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5263 ErrInfo = "Invalid dpp_ctrl value: "
5264 "broadcasts are not supported on GFX10+";
5265 return false;
5266 }
5267 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5268 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5269 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5270 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5271 !ST.hasGFX90AInsts()) {
5272 ErrInfo = "Invalid dpp_ctrl value: "
5273 "row_newbroadcast/row_share is not supported before "
5274 "GFX90A/GFX10";
5275 return false;
5276 }
5277 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5278 ErrInfo = "Invalid dpp_ctrl value: "
5279 "row_share and row_xmask are not supported before GFX10";
5280 return false;
5281 }
5282 }
5283
5284 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5286 ErrInfo = "Invalid dpp_ctrl value: "
5287 "DP ALU dpp only support row_newbcast";
5288 return false;
5289 }
5290 }
5291
5292 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5293 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5294 uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
5295 : AMDGPU::OpName::vdata;
5296 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
5297 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5298 if (Data && !Data->isReg())
5299 Data = nullptr;
5300
5301 if (ST.hasGFX90AInsts()) {
5302 if (Dst && Data &&
5303 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5304 ErrInfo = "Invalid register class: "
5305 "vdata and vdst should be both VGPR or AGPR";
5306 return false;
5307 }
5308 if (Data && Data2 &&
5309 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5310 ErrInfo = "Invalid register class: "
5311 "both data operands should be VGPR or AGPR";
5312 return false;
5313 }
5314 } else {
5315 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5316 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5317 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5318 ErrInfo = "Invalid register class: "
5319 "agpr loads and stores not supported on this GPU";
5320 return false;
5321 }
5322 }
5323 }
5324
5325 if (ST.needsAlignedVGPRs()) {
5326 const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
5327 const MachineOperand *Op = getNamedOperand(MI, OpName);
5328 if (!Op)
5329 return true;
5330 Register Reg = Op->getReg();
5331 if (Reg.isPhysical())
5332 return !(RI.getHWRegIndex(Reg) & 1);
5333 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5334 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5335 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5336 };
5337
5338 if (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
5339 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
5340 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
5341
5342 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5343 ErrInfo = "Subtarget requires even aligned vector registers "
5344 "for DS_GWS instructions";
5345 return false;
5346 }
5347 }
5348
5349 if (isMIMG(MI)) {
5350 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5351 ErrInfo = "Subtarget requires even aligned vector registers "
5352 "for vaddr operand of image instructions";
5353 return false;
5354 }
5355 }
5356 }
5357
5358 if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
5359 !ST.hasGFX90AInsts()) {
5360 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5361 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5362 ErrInfo = "Invalid register class: "
5363 "v_accvgpr_write with an SGPR is not supported on this GPU";
5364 return false;
5365 }
5366 }
5367
5368 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5369 const MachineOperand &SrcOp = MI.getOperand(1);
5370 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5371 ErrInfo = "pseudo expects only physical SGPRs";
5372 return false;
5373 }
5374 }
5375
5376 return true;
5377}
5378
5379// It is more readable to list mapped opcodes on the same line.
5380// clang-format off
5381
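// Maps a scalar (SALU) opcode to the VALU opcode to use when the instruction
// has to be rewritten to execute on the vector ALU; AMDGPU::INSTRUCTION_LIST_END
// means there is no direct VALU equivalent.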
5382 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5383 switch (MI.getOpcode()) {
5384 default: return AMDGPU::INSTRUCTION_LIST_END;
5385 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5386 case AMDGPU::COPY: return AMDGPU::COPY;
5387 case AMDGPU::PHI: return AMDGPU::PHI;
5388 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5389 case AMDGPU::WQM: return AMDGPU::WQM;
5390 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5391 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5392 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5393 case AMDGPU::S_MOV_B32: {
5394 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5395 return MI.getOperand(1).isReg() ||
5396 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5397 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5398 }
5399 case AMDGPU::S_ADD_I32:
5400 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5401 case AMDGPU::S_ADDC_U32:
5402 return AMDGPU::V_ADDC_U32_e32;
5403 case AMDGPU::S_SUB_I32:
5404 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5405 // FIXME: These are not consistently handled, and selected when the carry is
5406 // used.
5407 case AMDGPU::S_ADD_U32:
5408 return AMDGPU::V_ADD_CO_U32_e32;
5409 case AMDGPU::S_SUB_U32:
5410 return AMDGPU::V_SUB_CO_U32_e32;
5411 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5412 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5413 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5414 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5415 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5416 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5417 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5418 case AMDGPU::S_XNOR_B32:
5419 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5420 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5421 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5422 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5423 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5424 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5425 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5426 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5427 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5428 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5429 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5430 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5431 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5432 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5433 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5434 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5435 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5436 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5437 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5438 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5439 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5440 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5441 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5442 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5443 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5444 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5445 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5446 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5447 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5448 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5449 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5450 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5451 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5452 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5453 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5454 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5455 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5456 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5457 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5458 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5459 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5460 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5461 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5462 case AMDGPU::S_CVT_F32_F16:
5463 case AMDGPU::S_CVT_HI_F32_F16:
5464 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5465 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5466 case AMDGPU::S_CVT_F16_F32:
5467 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5468 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5469 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5470 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5471 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5472 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5473 case AMDGPU::S_CEIL_F16:
5474 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5475 : AMDGPU::V_CEIL_F16_fake16_e64;
5476 case AMDGPU::S_FLOOR_F16:
5477 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5478 : AMDGPU::V_FLOOR_F16_fake16_e64;
5479 case AMDGPU::S_TRUNC_F16:
5480 return AMDGPU::V_TRUNC_F16_fake16_e64;
5481 case AMDGPU::S_RNDNE_F16:
5482 return AMDGPU::V_RNDNE_F16_fake16_e64;
5483 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5484 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5485 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5486 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5487 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5488 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5489 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5490 case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
5491 case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
5492 case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
5493 case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5494 case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5495 case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5496 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
5497 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5498 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5499 case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
5500 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5501 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5502 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5503 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5504 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5505 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5506 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5507 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5508 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5509 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5510 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5511 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5512 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5513 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5514 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5515 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5516 case AMDGPU::S_CMP_LT_F16:
5517 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5518 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5519 case AMDGPU::S_CMP_EQ_F16:
5520 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5521 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5522 case AMDGPU::S_CMP_LE_F16:
5523 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5524 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5525 case AMDGPU::S_CMP_GT_F16:
5526 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5527 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5528 case AMDGPU::S_CMP_LG_F16:
5529 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5530 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5531 case AMDGPU::S_CMP_GE_F16:
5532 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5533 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5534 case AMDGPU::S_CMP_O_F16:
5535 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5536 : AMDGPU::V_CMP_O_F16_fake16_e64;
5537 case AMDGPU::S_CMP_U_F16:
5538 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5539 : AMDGPU::V_CMP_U_F16_fake16_e64;
5540 case AMDGPU::S_CMP_NGE_F16:
5541 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5542 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5543 case AMDGPU::S_CMP_NLG_F16:
5544 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5545 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5546 case AMDGPU::S_CMP_NGT_F16:
5547 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5548 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5549 case AMDGPU::S_CMP_NLE_F16:
5550 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5551 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5552 case AMDGPU::S_CMP_NEQ_F16:
5553 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5554 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5555 case AMDGPU::S_CMP_NLT_F16:
5556 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5557 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5558 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5559 case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
5560 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5561 case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
5562 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5563 case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
5564 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5565 case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
5566 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5567 case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
5568 }
5570 "Unexpected scalar opcode without corresponding vector one!");
5571}
5572
5573// clang-format on
5574
5575 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5576 MachineBasicBlock &MBB,
5577 MachineBasicBlock::iterator MBBI,
5578 const DebugLoc &DL, Register Reg,
5579 bool IsSCCLive,
5580 SlotIndexes *Indexes) const {
5581 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5582 const SIInstrInfo *TII = ST.getInstrInfo();
5583 bool IsWave32 = ST.isWave32();
5584 if (IsSCCLive) {
5585 // Insert two move instructions, one to save the original value of EXEC and
5586 // the other to turn on all bits in EXEC. This is required as we can't use
5587 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
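// In other words: S_MOV Reg, EXEC to save the mask, then S_MOV EXEC, -1 to
// enable all lanes, neither of which touches SCC.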
5588 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5589 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5590 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5591 .addReg(Exec, RegState::Kill);
5592 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5593 if (Indexes) {
5594 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5595 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5596 }
5597 } else {
5598 const unsigned OrSaveExec =
5599 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5600 auto SaveExec =
5601 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5602 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5603 if (Indexes)
5604 Indexes->insertMachineInstrInMaps(*SaveExec);
5605 }
5606}
5607
5608 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5609 MachineBasicBlock::iterator MBBI,
5610 const DebugLoc &DL, Register Reg,
5611 SlotIndexes *Indexes) const {
5612 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5613 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5614 auto ExecRestoreMI =
5615 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5616 if (Indexes)
5617 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5618}
5619
5620static const TargetRegisterClass *
5621 adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
5622 const MachineRegisterInfo &MRI,
5623 const MCInstrDesc &TID, unsigned RCID,
5624 bool IsAllocatable) {
5625 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5626 (((TID.mayLoad() || TID.mayStore()) &&
5627 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5628 (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
5629 switch (RCID) {
5630 case AMDGPU::AV_32RegClassID:
5631 RCID = AMDGPU::VGPR_32RegClassID;
5632 break;
5633 case AMDGPU::AV_64RegClassID:
5634 RCID = AMDGPU::VReg_64RegClassID;
5635 break;
5636 case AMDGPU::AV_96RegClassID:
5637 RCID = AMDGPU::VReg_96RegClassID;
5638 break;
5639 case AMDGPU::AV_128RegClassID:
5640 RCID = AMDGPU::VReg_128RegClassID;
5641 break;
5642 case AMDGPU::AV_160RegClassID:
5643 RCID = AMDGPU::VReg_160RegClassID;
5644 break;
5645 case AMDGPU::AV_512RegClassID:
5646 RCID = AMDGPU::VReg_512RegClassID;
5647 break;
5648 default:
5649 break;
5650 }
5651 }
5652
5653 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5654}
5655
5656 const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
5657 unsigned OpNum, const TargetRegisterInfo *TRI,
5658 const MachineFunction &MF)
5659 const {
5660 if (OpNum >= TID.getNumOperands())
5661 return nullptr;
5662 auto RegClass = TID.operands()[OpNum].RegClass;
5663 bool IsAllocatable = false;
5664 if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
5665 // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
5666 // with two data operands. Request a register class constrained to VGPR only
5667 // if both operands are present, as Machine Copy Propagation cannot check this
5668 // constraint (and possibly other passes cannot either).
5669 //
5670 // The check is limited to FLAT and DS because atomics in non-flat encoding
5671 // have their vdst and vdata tied to be the same register.
5672 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5673 AMDGPU::OpName::vdst);
5674 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5675 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5676 : AMDGPU::OpName::vdata);
5677 if (DataIdx != -1) {
5678 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5679 TID.Opcode, AMDGPU::OpName::data1);
5680 }
5681 }
5682 return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
5683 IsAllocatable);
5684}
5685
5686 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
5687 unsigned OpNo) const {
5688 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5689 const MCInstrDesc &Desc = get(MI.getOpcode());
5690 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
5691 Desc.operands()[OpNo].RegClass == -1) {
5692 Register Reg = MI.getOperand(OpNo).getReg();
5693
5694 if (Reg.isVirtual())
5695 return MRI.getRegClass(Reg);
5696 return RI.getPhysRegBaseClass(Reg);
5697 }
5698
5699 unsigned RCID = Desc.operands()[OpNo].RegClass;
5700 return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
5701}
5702
5703 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
5704 MachineBasicBlock::iterator I = MI;
5705 MachineBasicBlock *MBB = MI.getParent();
5706 MachineOperand &MO = MI.getOperand(OpIdx);
5707 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
5708 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
5709 const TargetRegisterClass *RC = RI.getRegClass(RCID);
5710 unsigned Size = RI.getRegSizeInBits(*RC);
5711 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
5712 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
5713 : AMDGPU::V_MOV_B32_e32;
5714 if (MO.isReg())
5715 Opcode = AMDGPU::COPY;
5716 else if (RI.isSGPRClass(RC))
5717 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
5718
5719 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
5720 Register Reg = MRI.createVirtualRegister(VRC);
5721 DebugLoc DL = MBB->findDebugLoc(I);
5722 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
5723 MO.ChangeToRegister(Reg, false);
5724}
5725
5726 unsigned SIInstrInfo::buildExtractSubReg(
5727 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
5728 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
5729 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5730 if (!SuperReg.getReg().isVirtual())
5731 return RI.getSubReg(SuperReg.getReg(), SubIdx);
5732
5733 MachineBasicBlock *MBB = MI->getParent();
5734 DebugLoc DL = MI->getDebugLoc();
5735 Register SubReg = MRI.createVirtualRegister(SubRC);
5736
5737 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
5738 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5739 .addReg(SuperReg.getReg(), 0, NewSubIdx);
5740 return SubReg;
5741}
5742
5743 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
5744 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
5745 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
5746 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5747 if (Op.isImm()) {
5748 if (SubIdx == AMDGPU::sub0)
5749 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
5750 if (SubIdx == AMDGPU::sub1)
5751 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
5752
5753 llvm_unreachable("Unhandled register index for immediate");
5754 }
5755
5756 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
5757 SubIdx, SubRC);
5758 return MachineOperand::CreateReg(SubReg, false);
5759}
5760
5761// Change the order of operands from (0, 1, 2) to (0, 2, 1)
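// Removing operand 1 shifts the old operand 2 down into slot 1; re-adding the
// saved operand then appends it as the new operand 2.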
5762void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
5763 assert(Inst.getNumExplicitOperands() == 3);
5764 MachineOperand Op1 = Inst.getOperand(1);
5765 Inst.removeOperand(1);
5766 Inst.addOperand(Op1);
5767}
5768
5769 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
5770 const MCOperandInfo &OpInfo,
5771 const MachineOperand &MO) const {
5772 if (!MO.isReg())
5773 return false;
5774
5775 Register Reg = MO.getReg();
5776
5777 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
5778 if (Reg.isPhysical())
5779 return DRC->contains(Reg);
5780
5781 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
5782
5783 if (MO.getSubReg()) {
5784 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5785 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
5786 if (!SuperRC)
5787 return false;
5788
5789 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
5790 if (!DRC)
5791 return false;
5792 }
5793 return RC->hasSuperClassEq(DRC);
5794}
5795
5796 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
5797 const MCOperandInfo &OpInfo,
5798 const MachineOperand &MO) const {
5799 if (MO.isReg())
5800 return isLegalRegOperand(MRI, OpInfo, MO);
5801
5802 // Handle non-register types that are treated like immediates.
5803 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
5804 return true;
5805}
5806
5807bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
5808 const MachineOperand *MO) const {
5809 const MachineFunction &MF = *MI.getParent()->getParent();
5810 const MachineRegisterInfo &MRI = MF.getRegInfo();
5811 const MCInstrDesc &InstDesc = MI.getDesc();
5812 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
5813 const TargetRegisterClass *DefinedRC =
5814 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
5815 if (!MO)
5816 MO = &MI.getOperand(OpIdx);
5817
5818 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
5819 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
5820 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
5821 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--)
5822 return false;
5823
5824 SmallDenseSet<RegSubRegPair> SGPRsUsed;
5825 if (MO->isReg())
5826 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
5827
5828 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
5829 if (i == OpIdx)
5830 continue;
5831 const MachineOperand &Op = MI.getOperand(i);
5832 if (Op.isReg()) {
5833 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
5834 if (!SGPRsUsed.count(SGPR) &&
5835 // FIXME: This can access off the end of the operands() array.
5836 usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
5837 if (--ConstantBusLimit <= 0)
5838 return false;
5839 SGPRsUsed.insert(SGPR);
5840 }
5841 } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
5842 !isInlineConstant(Op, InstDesc.operands()[i])) {
5843 if (!LiteralLimit--)
5844 return false;
5845 if (--ConstantBusLimit <= 0)
5846 return false;
5847 }
5848 }
5849 } else if (ST.hasNoF16PseudoScalarTransInlineConstants() && !MO->isReg() &&
5850 isF16PseudoScalarTrans(MI.getOpcode()) &&
5851 isInlineConstant(*MO, OpInfo)) {
5852 return false;
5853 }
5854
5855 if (MO->isReg()) {
5856 if (!DefinedRC)
5857 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
5858 if (!isLegalRegOperand(MRI, OpInfo, *MO))
5859 return false;
5860 bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
5861 if (IsAGPR && !ST.hasMAIInsts())
5862 return false;
5863 unsigned Opc = MI.getOpcode();
5864 if (IsAGPR &&
5865 (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5866 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5867 return false;
5868 // Atomics should have both vdst and vdata either vgpr or agpr.
5869 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5870 const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
5871 isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5872 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5873 MI.getOperand(DataIdx).isReg() &&
5874 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5875 return false;
5876 if ((int)OpIdx == DataIdx) {
5877 if (VDstIdx != -1 &&
5878 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5879 return false;
5880 // DS instructions with 2 src operands also must have tied RC.
5881 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
5882 AMDGPU::OpName::data1);
5883 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5884 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5885 return false;
5886 }
5887 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5888 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5889 RI.isSGPRReg(MRI, MO->getReg()))
5890 return false;
5891 return true;
5892 }
5893
5894 if (MO->isImm()) {
5895 uint64_t Imm = MO->getImm();
5896 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
5897 bool Is64BitOp = Is64BitFPOp ||
5898 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
5899 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
5900 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
5901 if (Is64BitOp &&
5902 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
5903 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
5904 return false;
5905
5906 // FIXME: We can use sign extended 64-bit literals, but only for signed
5907 // operands. At the moment we do not know if an operand is signed.
5908 // Such operand will be encoded as its low 32 bits and then either
5909 // correctly sign extended or incorrectly zero extended by HW.
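// Hence conservatively reject 64-bit integer literals whose low 32 bits are
// negative, where an incorrect zero extension would change the value.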
5910 if (!Is64BitFPOp && (int32_t)Imm < 0)
5911 return false;
5912 }
5913 }
5914
5915 // Handle non-register types that are treated like immediates.
5916 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
5917
5918 if (!DefinedRC) {
5919 // This operand expects an immediate.
5920 return true;
5921 }
5922
5923 return isImmOperandLegal(MI, OpIdx, *MO);
5924}
5925
5926 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
5927 MachineInstr &MI) const {
5928 unsigned Opc = MI.getOpcode();
5929 const MCInstrDesc &InstrDesc = get(Opc);
5930
5931 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5932 MachineOperand &Src0 = MI.getOperand(Src0Idx);
5933
5934 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
5935 MachineOperand &Src1 = MI.getOperand(Src1Idx);
5936
5937 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
5938 // we need to only have one constant bus use before GFX10.
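// For example, V_ADDC_U32_e32 implicitly reads VCC; with a constant bus limit
// of one, an SGPR in src0 would be a second constant bus read, so it is moved
// into a VGPR instead.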
5939 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
5940 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
5941 RI.isSGPRReg(MRI, Src0.getReg()))
5942 legalizeOpWithMove(MI, Src0Idx);
5943
5944 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
5945 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
5946 // src0/src1 with V_READFIRSTLANE.
5947 if (Opc == AMDGPU::V_WRITELANE_B32) {
5948 const DebugLoc &DL = MI.getDebugLoc();
5949 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
5950 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5951 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5952 .add(Src0);
5953 Src0.ChangeToRegister(Reg, false);
5954 }
5955 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
5956 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5957 const DebugLoc &DL = MI.getDebugLoc();
5958 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5959 .add(Src1);
5960 Src1.ChangeToRegister(Reg, false);
5961 }
5962 return;
5963 }
5964
5965 // No VOP2 instructions support AGPRs.
5966 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
5967 legalizeOpWithMove(MI, Src0Idx);
5968
5969 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
5970 legalizeOpWithMove(MI, Src1Idx);
5971
5972 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
5973 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
5974 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
5975 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
5976 legalizeOpWithMove(MI, Src2Idx);
5977 }
5978
5979 // VOP2 src0 instructions support all operand types, so we don't need to check
5980 // their legality. If src1 is already legal, we don't need to do anything.
5981 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
5982 return;
5983
5984 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
5985 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
5986 // select is uniform.
5987 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
5988 RI.isVGPR(MRI, Src1.getReg())) {
5989 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5990 const DebugLoc &DL = MI.getDebugLoc();
5991 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5992 .add(Src1);
5993 Src1.ChangeToRegister(Reg, false);
5994 return;
5995 }
5996
5997 // We do not use commuteInstruction here because it is too aggressive and will
5998 // commute if it is possible. We only want to commute here if it improves
5999 // legality. This can be called a fairly large number of times so don't waste
6000 // compile time pointlessly swapping and checking legality again.
6001 if (HasImplicitSGPR || !MI.isCommutable()) {
6002 legalizeOpWithMove(MI, Src1Idx);
6003 return;
6004 }
6005
6006 // If src0 can be used as src1, commuting will make the operands legal.
6007 // Otherwise we have to give up and insert a move.
6008 //
6009 // TODO: Other immediate-like operand kinds could be commuted if there was a
6010 // MachineOperand::ChangeTo* for them.
6011 if ((!Src1.isImm() && !Src1.isReg()) ||
6012 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6013 legalizeOpWithMove(MI, Src1Idx);
6014 return;
6015 }
6016
6017 int CommutedOpc = commuteOpcode(MI);
6018 if (CommutedOpc == -1) {
6019 legalizeOpWithMove(MI, Src1Idx);
6020 return;
6021 }
6022
6023 MI.setDesc(get(CommutedOpc));
6024
6025 Register Src0Reg = Src0.getReg();
6026 unsigned Src0SubReg = Src0.getSubReg();
6027 bool Src0Kill = Src0.isKill();
6028
6029 if (Src1.isImm())
6030 Src0.ChangeToImmediate(Src1.getImm());
6031 else if (Src1.isReg()) {
6032 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6033 Src0.setSubReg(Src1.getSubReg());
6034 } else
6035 llvm_unreachable("Should only have register or immediate operands");
6036
6037 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6038 Src1.setSubReg(Src0SubReg);
6039 fixImplicitOperands(MI);
6040}
6041
6042// Legalize VOP3 operands. All operand types are supported for any operand
6043// but only one literal constant and only starting from GFX10.
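// In practice that means: without VOP3 literal support every non-inline
// immediate must be materialized in a register, and SGPR sources are bounded
// by the constant bus budget (one read before GFX10, two starting from GFX10).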
6044 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6045 MachineInstr &MI) const {
6046 unsigned Opc = MI.getOpcode();
6047
6048 int VOP3Idx[3] = {
6049 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6050 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6051 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6052 };
6053
6054 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6055 Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
6056 // src1 and src2 must be scalar
6057 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6058 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6059 const DebugLoc &DL = MI.getDebugLoc();
6060 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6061 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6062 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6063 .add(Src1);
6064 Src1.ChangeToRegister(Reg, false);
6065 }
6066 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6067 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6068 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6069 .add(Src2);
6070 Src2.ChangeToRegister(Reg, false);
6071 }
6072 }
6073
6074 // Find the one SGPR operand we are allowed to use.
6075 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6076 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6077 SmallDenseSet<unsigned> SGPRsUsed;
6078 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6079 if (SGPRReg) {
6080 SGPRsUsed.insert(SGPRReg);
6081 --ConstantBusLimit;
6082 }
6083
6084 for (int Idx : VOP3Idx) {
6085 if (Idx == -1)
6086 break;
6087 MachineOperand &MO = MI.getOperand(Idx);
6088
6089 if (!MO.isReg()) {
6090 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6091 continue;
6092
6093 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6094 --LiteralLimit;
6095 --ConstantBusLimit;
6096 continue;
6097 }
6098
6099 --LiteralLimit;
6100 --ConstantBusLimit;
6101 legalizeOpWithMove(MI, Idx);
6102 continue;
6103 }
6104
6105 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6106 !isOperandLegal(MI, Idx, &MO)) {
6107 legalizeOpWithMove(MI, Idx);
6108 continue;
6109 }
6110
6111 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6112 continue; // VGPRs are legal
6113
6114 // We can use one SGPR in each VOP3 instruction prior to GFX10
6115 // and two starting from GFX10.
6116 if (SGPRsUsed.count(MO.getReg()))
6117 continue;
6118 if (ConstantBusLimit > 0) {
6119 SGPRsUsed.insert(MO.getReg());
6120 --ConstantBusLimit;
6121 continue;
6122 }
6123
6124 // If we make it this far, then the operand is not legal and we must
6125 // legalize it.
6126 legalizeOpWithMove(MI, Idx);
6127 }
6128
6129 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6130 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6131 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6132 legalizeOpWithMove(MI, VOP3Idx[2]);
6133}
6134
6135 Register SIInstrInfo::readlaneVGPRToSGPR(
6136 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6137 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6138 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6139 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6140 if (DstRC)
6141 SRC = RI.getCommonSubClass(SRC, DstRC);
6142
6143 Register DstReg = MRI.createVirtualRegister(SRC);
6144 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6145
6146 if (RI.hasAGPRs(VRC)) {
6147 VRC = RI.getEquivalentVGPRClass(VRC);
6148 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6149 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6150 get(TargetOpcode::COPY), NewSrcReg)
6151 .addReg(SrcReg);
6152 SrcReg = NewSrcReg;
6153 }
6154
6155 if (SubRegs == 1) {
6156 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6157 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6158 .addReg(SrcReg);
6159 return DstReg;
6160 }
6161
6162 SmallVector<Register, 8> SRegs;
6163 for (unsigned i = 0; i < SubRegs; ++i) {
6164 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6165 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6166 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6167 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6168 SRegs.push_back(SGPR);
6169 }
6170
6171 MachineInstrBuilder MIB =
6172 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6173 get(AMDGPU::REG_SEQUENCE), DstReg);
6174 for (unsigned i = 0; i < SubRegs; ++i) {
6175 MIB.addReg(SRegs[i]);
6176 MIB.addImm(RI.getSubRegFromChannel(i));
6177 }
6178 return DstReg;
6179}
6180
6181 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6182 MachineInstr &MI) const {
6183
6184 // If the pointer is stored in VGPRs, then we need to move it to
6185 // SGPRs using v_readfirstlane. This is safe because we only select
6186 // loads with uniform pointers to SMRD instructions, so we know the
6187 // pointer value is uniform.
6188 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6189 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6190 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6191 SBase->setReg(SGPR);
6192 }
6193 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6194 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6195 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6196 SOff->setReg(SGPR);
6197 }
6198}
6199
6200 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6201 unsigned Opc = Inst.getOpcode();
6202 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6203 if (OldSAddrIdx < 0)
6204 return false;
6205
6206 assert(isSegmentSpecificFLAT(Inst));
6207
6208 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6209 if (NewOpc < 0)
6210 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6211 if (NewOpc < 0)
6212 return false;
6213
6215 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6216 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6217 return false;
6218
6219 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6220 if (NewVAddrIdx < 0)
6221 return false;
6222
6223 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6224
6225 // Check vaddr, it shall be zero or absent.
6226 MachineInstr *VAddrDef = nullptr;
6227 if (OldVAddrIdx >= 0) {
6228 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6229 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6230 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6231 !VAddrDef->getOperand(1).isImm() ||
6232 VAddrDef->getOperand(1).getImm() != 0)
6233 return false;
6234 }
6235
6236 const MCInstrDesc &NewDesc = get(NewOpc);
6237 Inst.setDesc(NewDesc);
6238
6239 // Callers expect iterator to be valid after this call, so modify the
6240 // instruction in place.
6241 if (OldVAddrIdx == NewVAddrIdx) {
6242 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6243 // Clear use list from the old vaddr holding a zero register.
6244 MRI.removeRegOperandFromUseList(&NewVAddr);
6245 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6246 Inst.removeOperand(OldSAddrIdx);
6247 // Update the use list with the pointer we have just moved from vaddr to
6248 // saddr position. Otherwise new vaddr will be missing from the use list.
6249 MRI.removeRegOperandFromUseList(&NewVAddr);
6250 MRI.addRegOperandToUseList(&NewVAddr);
6251 } else {
6252 assert(OldSAddrIdx == NewVAddrIdx);
6253
6254 if (OldVAddrIdx >= 0) {
6255 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6256 AMDGPU::OpName::vdst_in);
6257
6258 // removeOperand doesn't try to fixup tied operand indexes as it goes, so
6259 // it asserts. Untie the operands for now and retie them afterwards.
6260 if (NewVDstIn != -1) {
6261 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6262 Inst.untieRegOperand(OldVDstIn);
6263 }
6264
6265 Inst.removeOperand(OldVAddrIdx);
6266
6267 if (NewVDstIn != -1) {
6268 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6269 Inst.tieOperands(NewVDst, NewVDstIn);
6270 }
6271 }
6272 }
6273
6274 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6275 VAddrDef->eraseFromParent();
6276
6277 return true;
6278}
6279
6280// FIXME: Remove this when SelectionDAG is obsoleted.
6281 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6282 MachineInstr &MI) const {
6283 if (!isSegmentSpecificFLAT(MI))
6284 return;
6285
6286 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6287 // thinks they are uniform, so a readfirstlane should be valid.
6288 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6289 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6290 return;
6291
6292 if (moveFlatAddrToVGPR(MI))
6293 return;
6294
6295 const TargetRegisterClass *DeclaredRC = getRegClass(
6296 MI.getDesc(), SAddr->getOperandNo(), &RI, *MI.getParent()->getParent());
6297
6298 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6299 SAddr->setReg(ToSGPR);
6300}
6301
6302 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6303 MachineBasicBlock::iterator I,
6304 const TargetRegisterClass *DstRC,
6305 MachineOperand &Op,
6306 MachineRegisterInfo &MRI,
6307 const DebugLoc &DL) const {
6308 Register OpReg = Op.getReg();
6309 unsigned OpSubReg = Op.getSubReg();
6310
6311 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6312 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6313
6314 // Check if operand is already the correct register class.
6315 if (DstRC == OpRC)
6316 return;
6317
6318 Register DstReg = MRI.createVirtualRegister(DstRC);
6319 auto Copy =
6320 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6321 Op.setReg(DstReg);
6322
6323 MachineInstr *Def = MRI.getVRegDef(OpReg);
6324 if (!Def)
6325 return;
6326
6327 // Try to eliminate the copy if it is copying an immediate value.
6328 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6329 foldImmediate(*Copy, *Def, OpReg, &MRI);
6330
6331 bool ImpDef = Def->isImplicitDef();
6332 while (!ImpDef && Def && Def->isCopy()) {
6333 if (Def->getOperand(1).getReg().isPhysical())
6334 break;
6335 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6336 ImpDef = Def && Def->isImplicitDef();
6337 }
6338 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6339 !ImpDef)
6340 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6341}
6342
6343// Emit the actual waterfall loop, executing the wrapped instruction for each
6344// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6345// iteration, in the worst case we execute 64 (once per lane).
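// The emitted structure is roughly:
//   LoopBB: v_readfirstlane each 32-bit piece of the operand, compare the
//           result with the VGPR value, AND the per-operand comparisons
//           together, then s_and_saveexec with the combined mask.
//   BodyBB: the wrapped instruction runs with EXEC restricted to the matching
//           lanes; the terminators XOR those lanes out of EXEC and branch back
//           to LoopBB (SI_WATERFALL_LOOP) while any lanes remain.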
6346static void
6347 emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
6348 MachineRegisterInfo &MRI,
6349 MachineBasicBlock &LoopBB,
6350 MachineBasicBlock &BodyBB,
6351 const DebugLoc &DL,
6352 ArrayRef<MachineOperand *> ScalarOps) {
6353 MachineFunction &MF = *LoopBB.getParent();
6354 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6355 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6356 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6357 unsigned SaveExecOpc =
6358 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6359 unsigned XorTermOpc =
6360 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6361 unsigned AndOpc =
6362 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6363 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6364
6365 MachineBasicBlock::iterator I = LoopBB.begin();
6366 Register CondReg;
6367
6368 for (MachineOperand *ScalarOp : ScalarOps) {
6369 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6370 unsigned NumSubRegs = RegSize / 32;
6371 Register VScalarOp = ScalarOp->getReg();
6372
6373 if (NumSubRegs == 1) {
6374 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6375
6376 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6377 .addReg(VScalarOp);
6378
6379 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6380
6381 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6382 .addReg(CurReg)
6383 .addReg(VScalarOp);
6384
6385 // Combine the comparison results with AND.
6386 if (!CondReg) // First.
6387 CondReg = NewCondReg;
6388 else { // If not the first, we create an AND.
6389 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6390 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6391 .addReg(CondReg)
6392 .addReg(NewCondReg);
6393 CondReg = AndReg;
6394 }
6395
6396 // Update ScalarOp operand to use the SGPR ScalarOp.
6397 ScalarOp->setReg(CurReg);
6398 ScalarOp->setIsKill();
6399 } else {
6400 SmallVector<Register, 8> ReadlanePieces;
6401 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6402 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6403 "Unhandled register size");
6404
6405 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6406 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6407 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6408
6409 // Read the next variant <- also loop target.
6410 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6411 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6412
6413 // Read the next variant <- also loop target.
6414 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6415 .addReg(VScalarOp, VScalarOpUndef,
6416 TRI->getSubRegFromChannel(Idx + 1));
6417
6418 ReadlanePieces.push_back(CurRegLo);
6419 ReadlanePieces.push_back(CurRegHi);
6420
6421 // Comparison is to be done as 64-bit.
6422 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6423 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6424 .addReg(CurRegLo)
6425 .addImm(AMDGPU::sub0)
6426 .addReg(CurRegHi)
6427 .addImm(AMDGPU::sub1);
6428
6429 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6430 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6431 NewCondReg)
6432 .addReg(CurReg);
6433 if (NumSubRegs <= 2)
6434 Cmp.addReg(VScalarOp);
6435 else
6436 Cmp.addReg(VScalarOp, VScalarOpUndef,
6437 TRI->getSubRegFromChannel(Idx, 2));
6438
6439 // Combine the comparison results with AND.
6440 if (!CondReg) // First.
6441 CondReg = NewCondReg;
6442 else { // If not the first, we create an AND.
6443 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6444 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6445 .addReg(CondReg)
6446 .addReg(NewCondReg);
6447 CondReg = AndReg;
6448 }
6449 } // End for loop.
6450
6451 const auto *SScalarOpRC =
6452 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6453 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6454
6455 // Build scalar ScalarOp.
6456 auto Merge =
6457 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6458 unsigned Channel = 0;
6459 for (Register Piece : ReadlanePieces) {
6460 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6461 }
6462
6463 // Update ScalarOp operand to use the SGPR ScalarOp.
6464 ScalarOp->setReg(SScalarOp);
6465 ScalarOp->setIsKill();
6466 }
6467 }
6468
6469 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6470 MRI.setSimpleHint(SaveExec, CondReg);
6471
6472 // Update EXEC to matching lanes, saving original to SaveExec.
6473 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6474 .addReg(CondReg, RegState::Kill);
6475
6476 // The original instruction is here; we insert the terminators after it.
6477 I = BodyBB.end();
6478
6479 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6480 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6481 .addReg(Exec)
6482 .addReg(SaveExec);
6483
6484 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6485}
6486
6487// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6488// with SGPRs by iterating over all unique values across all lanes.
6489// Returns the loop basic block that now contains \p MI.
6490static MachineBasicBlock *
6491 loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6492 ArrayRef<MachineOperand *> ScalarOps,
6493 MachineDominatorTree *MDT,
6494 MachineBasicBlock::iterator Begin = nullptr,
6495 MachineBasicBlock::iterator End = nullptr) {
6496 MachineBasicBlock &MBB = *MI.getParent();
6497 MachineFunction &MF = *MBB.getParent();
6498 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6499 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6500 MachineRegisterInfo &MRI = MF.getRegInfo();
6501 if (!Begin.isValid())
6502 Begin = &MI;
6503 if (!End.isValid()) {
6504 End = &MI;
6505 ++End;
6506 }
6507 const DebugLoc &DL = MI.getDebugLoc();
6508 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6509 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6510 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6511
6512 // Save SCC. Waterfall Loop may overwrite SCC.
6513 Register SaveSCCReg;
6514
6515 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
6516 // rather than unlimited scan everywhere
6517 bool SCCNotDead =
6518 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
6519 std::numeric_limits<unsigned>::max()) !=
6520 MachineBasicBlock::LQR_Dead;
6521 if (SCCNotDead) {
6522 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6523 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6524 .addImm(1)
6525 .addImm(0);
6526 }
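// S_CSELECT_B32 1, 0 materializes the current SCC bit into SaveSCCReg; the
// S_CMP_LG_U32 against 0 emitted after the loop recomputes SCC from it.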
6527
6528 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6529
6530 // Save the EXEC mask
6531 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6532
6533 // Killed uses in the instruction we are waterfalling around will be
6534 // incorrect due to the added control-flow.
6535 MachineBasicBlock::iterator AfterMI = MI;
6536 ++AfterMI;
6537 for (auto I = Begin; I != AfterMI; I++) {
6538 for (auto &MO : I->all_uses())
6539 MRI.clearKillFlags(MO.getReg());
6540 }
6541
6542 // To insert the loop we need to split the block. Move everything after this
6543 // point to a new block, and insert a new empty block between the two.
6544 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
6545 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
6546 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6547 MachineFunction::iterator MBBI(MBB);
6548 ++MBBI;
6549
6550 MF.insert(MBBI, LoopBB);
6551 MF.insert(MBBI, BodyBB);
6552 MF.insert(MBBI, RemainderBB);
6553
6554 LoopBB->addSuccessor(BodyBB);
6555 BodyBB->addSuccessor(LoopBB);
6556 BodyBB->addSuccessor(RemainderBB);
6557
6558 // Move the instructions from Begin to MI into BodyBB, and the remainder of the block to
6559 // RemainderBB.
6560 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6561 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6562 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6563
6564 MBB.addSuccessor(LoopBB);
6565
6566 // Update dominators. We know that MBB immediately dominates LoopBB, that
6567 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6568 // RemainderBB. RemainderBB immediately dominates all of the successors
6569 // transferred to it from MBB that MBB used to properly dominate.
6570 if (MDT) {
6571 MDT->addNewBlock(LoopBB, &MBB);
6572 MDT->addNewBlock(BodyBB, LoopBB);
6573 MDT->addNewBlock(RemainderBB, BodyBB);
6574 for (auto &Succ : RemainderBB->successors()) {
6575 if (MDT->properlyDominates(&MBB, Succ)) {
6576 MDT->changeImmediateDominator(Succ, RemainderBB);
6577 }
6578 }
6579 }
6580
6581 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
6582
6583 MachineBasicBlock::iterator First = RemainderBB->begin();
6584 // Restore SCC
6585 if (SCCNotDead) {
6586 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
6587 .addReg(SaveSCCReg, RegState::Kill)
6588 .addImm(0);
6589 }
6590
6591 // Restore the EXEC mask
6592 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
6593 return BodyBB;
6594}
6595
6596// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
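// The replacement descriptor has a null base address and carries only the
// default data format in its upper dwords, so all addressing goes through the
// extracted 64-bit pointer, which the caller folds into VAddr.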
6597static std::tuple<unsigned, unsigned>
6598 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
6599 MachineBasicBlock &MBB = *MI.getParent();
6600 MachineFunction &MF = *MBB.getParent();
6601 MachineRegisterInfo &MRI = MF.getRegInfo();
6602
6603 // Extract the ptr from the resource descriptor.
6604 unsigned RsrcPtr =
6605 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
6606 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
6607
6608 // Create an empty resource descriptor
6609 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6610 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6611 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6612 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6613 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
6614
6615 // Zero64 = 0
6616 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
6617 .addImm(0);
6618
6619 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6620 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
6621 .addImm(Lo_32(RsrcDataFormat));
6622
6623 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6624 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
6625 .addImm(Hi_32(RsrcDataFormat));
6626
6627 // NewSRsrc = {Zero64, SRsrcFormat}
6628 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
6629 .addReg(Zero64)
6630 .addImm(AMDGPU::sub0_sub1)
6631 .addReg(SRsrcFormatLo)
6632 .addImm(AMDGPU::sub2)
6633 .addReg(SRsrcFormatHi)
6634 .addImm(AMDGPU::sub3);
6635
6636 return std::tuple(RsrcPtr, NewSRsrc);
6637}
6638
6639 MachineBasicBlock *
6640 SIInstrInfo::legalizeOperands(MachineInstr &MI,
6641 MachineDominatorTree *MDT) const {
6642 MachineFunction &MF = *MI.getParent()->getParent();
6643 MachineRegisterInfo &MRI = MF.getRegInfo();
6644 MachineBasicBlock *CreatedBB = nullptr;
6645
6646 // Legalize VOP2
6647 if (isVOP2(MI) || isVOPC(MI)) {
6648 legalizeOperandsVOP2(MRI, MI);
6649 return CreatedBB;
6650 }
6651
6652 // Legalize VOP3
6653 if (isVOP3(MI)) {
6654 legalizeOperandsVOP3(MRI, MI);
6655 return CreatedBB;
6656 }
6657
6658 // Legalize SMRD
6659 if (isSMRD(MI)) {
6660 legalizeOperandsSMRD(MRI, MI);
6661 return CreatedBB;
6662 }
6663
6664 // Legalize FLAT
6665 if (isFLAT(MI)) {
6666 legalizeOperandsFLAT(MRI, MI);
6667 return CreatedBB;
6668 }
6669
6670 // Legalize REG_SEQUENCE and PHI
6671 // The register class of the operands must be the same type as the register
6672 // class of the output.
6673 if (MI.getOpcode() == AMDGPU::PHI) {
6674 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
6675 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
6676 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
6677 continue;
6678 const TargetRegisterClass *OpRC =
6679 MRI.getRegClass(MI.getOperand(i).getReg());
6680 if (RI.hasVectorRegisters(OpRC)) {
6681 VRC = OpRC;
6682 } else {
6683 SRC = OpRC;
6684 }
6685 }
6686
6687 // If any of the operands are VGPR registers, then they all must be VGPRs;
6688 // otherwise we will create illegal VGPR->SGPR copies when legalizing
6689 // them.
6690 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
6691 if (!VRC) {
6692 assert(SRC);
6693 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
6694 VRC = &AMDGPU::VReg_1RegClass;
6695 } else
6696 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6697 ? RI.getEquivalentAGPRClass(SRC)
6698 : RI.getEquivalentVGPRClass(SRC);
6699 } else {
6700 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6701 ? RI.getEquivalentAGPRClass(VRC)
6702 : RI.getEquivalentVGPRClass(VRC);
6703 }
6704 RC = VRC;
6705 } else {
6706 RC = SRC;
6707 }
6708
6709 // Update all the operands so they have the same type.
6710 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6711 MachineOperand &Op = MI.getOperand(I);
6712 if (!Op.isReg() || !Op.getReg().isVirtual())
6713 continue;
6714
6715 // MI is a PHI instruction.
6716 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
6717 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
6718
6719 // Avoid creating no-op copies with the same src and dst reg class. These
6720 // confuse some of the machine passes.
6721 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
6722 }
6723 }
6724
6725 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6726 // VGPR dest type and SGPR sources, insert copies so all operands are
6727 // VGPRs. This seems to help operand folding / the register coalescer.
6728 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
6729 MachineBasicBlock *MBB = MI.getParent();
6730 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
6731 if (RI.hasVGPRs(DstRC)) {
6732 // Update all the operands so they are VGPR register classes. These may
6733 // not be the same register class because REG_SEQUENCE supports mixing
6734 // subregister index types e.g. sub0_sub1 + sub2 + sub3
6735 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6736 MachineOperand &Op = MI.getOperand(I);
6737 if (!Op.isReg() || !Op.getReg().isVirtual())
6738 continue;
6739
6740 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
6741 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
6742 if (VRC == OpRC)
6743 continue;
6744
6745 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
6746 Op.setIsKill();
6747 }
6748 }
6749
6750 return CreatedBB;
6751 }
6752
6753 // Legalize INSERT_SUBREG
6754 // src0 must have the same register class as dst
6755 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
6756 Register Dst = MI.getOperand(0).getReg();
6757 Register Src0 = MI.getOperand(1).getReg();
6758 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
6759 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
6760 if (DstRC != Src0RC) {
6761 MachineBasicBlock *MBB = MI.getParent();
6762 MachineOperand &Op = MI.getOperand(1);
6763 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
6764 }
6765 return CreatedBB;
6766 }
6767
6768 // Legalize SI_INIT_M0
6769 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
6770 MachineOperand &Src = MI.getOperand(0);
6771 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6772 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6773 return CreatedBB;
6774 }
6775
6776 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6777 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
6778 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
6779 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
6780 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
6781 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
6782 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
6783 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
6784 MachineOperand &Src = MI.getOperand(1);
6785 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6786 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6787 return CreatedBB;
6788 }
6789
6790 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
6791 //
6792 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
6793 // scratch memory access. In both cases, the legalization never involves
6794 // conversion to the addr64 form.
6795 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
6796 (isMUBUF(MI) || isMTBUF(MI)))) {
6797 int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc
6798 : AMDGPU::OpName::srsrc;
6799 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
6800 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
6801 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
6802
6803 int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
6804 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
6805 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
6806 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
6807
6808 return CreatedBB;
6809 }
6810
6811 // Legalize SI_CALL
6812 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
6813 MachineOperand *Dest = &MI.getOperand(0);
6814 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
6815 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
6816 // following copies, we also need to move copies from and to physical
6817 // registers into the loop block.
6818 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
6819 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
6820
6821 // Also move the copies to physical registers into the loop block
6822 MachineBasicBlock &MBB = *MI.getParent();
6823 MachineBasicBlock::iterator Start(&MI);
6824 while (Start->getOpcode() != FrameSetupOpcode)
6825 --Start;
6826 MachineBasicBlock::iterator End(&MI);
6827 while (End->getOpcode() != FrameDestroyOpcode)
6828 ++End;
6829 // Also include following copies of the return value
6830 ++End;
6831 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
6832 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
6833 ++End;
6834 CreatedBB =
6835 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
6836 }
6837 }
6838
6839 // Legalize s_sleep_var.
6840 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
6841 const DebugLoc &DL = MI.getDebugLoc();
6842 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6843 int Src0Idx =
6844 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
6845 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6846 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6847 .add(Src0);
6848 Src0.ChangeToRegister(Reg, false);
6849 return nullptr;
6850 }
6851
6852 // Legalize MUBUF instructions.
6853 bool isSoffsetLegal = true;
6854 int SoffsetIdx =
6855 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
6856 if (SoffsetIdx != -1) {
6857 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
6858 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
6859 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
6860 isSoffsetLegal = false;
6861 }
6862 }
6863
6864 bool isRsrcLegal = true;
6865 int RsrcIdx =
6866 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
6867 if (RsrcIdx != -1) {
6868 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6869 if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) {
6870 isRsrcLegal = false;
6871 }
6872 }
6873
6874 // The operands are legal.
6875 if (isRsrcLegal && isSoffsetLegal)
6876 return CreatedBB;
6877
6878 if (!isRsrcLegal) {
6879 // Legalize a VGPR Rsrc
6880 //
6881 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
6882 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
6883 // a zero-value SRsrc.
6884 //
6885 // If the instruction is _OFFSET (both idxen and offen disabled), and we
6886 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
6887 // above.
6888 //
6889 // Otherwise we are on non-ADDR64 hardware, and/or we have
6890 // idxen/offen/bothen and we fall back to a waterfall loop.
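// As a rough sketch, the two non-waterfall paths below compute:
//   NewVAddr = the 64-bit base pointer extracted from the VGPR Rsrc
//              (added to the old VAddr, if one exists)
//   NewSRsrc = a default resource descriptor with a zero base pointer
// so the effective address seen by the instruction is unchanged.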
6891
6892 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6893 MachineBasicBlock &MBB = *MI.getParent();
6894
6895 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
6896 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
6897 // This is already an ADDR64 instruction so we need to add the pointer
6898 // extracted from the resource descriptor to the current value of VAddr.
6899 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6900 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6901 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6902
6903 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
6904 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
6905 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
6906
6907 unsigned RsrcPtr, NewSRsrc;
6908 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6909
6910 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
6911 const DebugLoc &DL = MI.getDebugLoc();
6912 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
6913 .addDef(CondReg0)
6914 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6915 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
6916 .addImm(0);
6917
6918 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
6919 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
6920 .addDef(CondReg1, RegState::Dead)
6921 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6922 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
6923 .addReg(CondReg0, RegState::Kill)
6924 .addImm(0);
6925
6926 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6927 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
6928 .addReg(NewVAddrLo)
6929 .addImm(AMDGPU::sub0)
6930 .addReg(NewVAddrHi)
6931 .addImm(AMDGPU::sub1);
6932
6933 VAddr->setReg(NewVAddr);
6934 Rsrc->setReg(NewSRsrc);
6935 } else if (!VAddr && ST.hasAddr64()) {
6936 // This instruction is the _OFFSET variant, so we need to convert it to
6937 // ADDR64.
6938 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
6939 "FIXME: Need to emit flat atomics here");
6940
6941 unsigned RsrcPtr, NewSRsrc;
6942 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6943
6944 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6945 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
6946 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
6947 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6948 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
6949
6950 // Atomics with return have an additional tied operand and are
6951 // missing some of the special bits.
6952 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
6953 MachineInstr *Addr64;
6954
6955 if (!VDataIn) {
6956 // Regular buffer load / store.
6957 MachineInstrBuilder MIB =
6958 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6959 .add(*VData)
6960 .addReg(NewVAddr)
6961 .addReg(NewSRsrc)
6962 .add(*SOffset)
6963 .add(*Offset);
6964
6965 if (const MachineOperand *CPol =
6966 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
6967 MIB.addImm(CPol->getImm());
6968 }
6969
6970 if (const MachineOperand *TFE =
6971 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
6972 MIB.addImm(TFE->getImm());
6973 }
6974
6975 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
6976
6977 MIB.cloneMemRefs(MI);
6978 Addr64 = MIB;
6979 } else {
6980 // Atomics with return.
6981 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6982 .add(*VData)
6983 .add(*VDataIn)
6984 .addReg(NewVAddr)
6985 .addReg(NewSRsrc)
6986 .add(*SOffset)
6987 .add(*Offset)
6988 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
6989 .cloneMemRefs(MI);
6990 }
6991
6992 MI.removeFromParent();
6993
6994 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6995 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
6996 NewVAddr)
6997 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6998 .addImm(AMDGPU::sub0)
6999 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7000 .addImm(AMDGPU::sub1);
7001 } else {
7002 // Legalize a VGPR Rsrc and soffset together.
7003 if (!isSoffsetLegal) {
7004 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7005 CreatedBB =
7006 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7007 return CreatedBB;
7008 }
7009 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7010 return CreatedBB;
7011 }
7012 }
7013
7014 // Legalize a VGPR soffset.
7015 if (!isSoffsetLegal) {
7016 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7017 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7018 return CreatedBB;
7019 }
7020 return CreatedBB;
7021}
7022
7023 void SIInstrWorklist::insert(MachineInstr *MI) {
7024 InstrList.insert(MI);
7025 // Add MBUF instructions to the deferred list.
7026 int RsrcIdx =
7027 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7028 if (RsrcIdx != -1) {
7029 DeferredList.insert(MI);
7030 }
7031}
7032
7033 bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7034 return DeferredList.contains(MI);
7035}
7036
7037 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7038 MachineDominatorTree *MDT) const {
7039
7040 while (!Worklist.empty()) {
7041 MachineInstr &Inst = *Worklist.top();
7042 Worklist.erase_top();
7043 // Skip MachineInstr in the deferred list.
7044 if (Worklist.isDeferred(&Inst))
7045 continue;
7046 moveToVALUImpl(Worklist, MDT, Inst);
7047 }
7048
7049 // The deferred list of instructions will be processed once
7050 // all the MachineInstrs in the worklist have been processed.
7051 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7052 moveToVALUImpl(Worklist, MDT, *Inst);
7053 assert(Worklist.empty() &&
7054 "Deferred MachineInstr are not supposed to re-populate worklist");
7055 }
7056}
7057
7058 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7059 MachineDominatorTree *MDT,
7060 MachineInstr &Inst) const {
7061
7062 MachineBasicBlock *MBB = Inst.getParent();
7063 if (!MBB)
7064 return;
7065 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7066 unsigned Opcode = Inst.getOpcode();
7067 unsigned NewOpcode = getVALUOp(Inst);
7068 // Handle some special cases
7069 switch (Opcode) {
7070 default:
7071 break;
7072 case AMDGPU::S_ADD_U64_PSEUDO:
7073 NewOpcode = AMDGPU::V_ADD_U64_PSEUDO;
7074 break;
7075 case AMDGPU::S_SUB_U64_PSEUDO:
7076 NewOpcode = AMDGPU::V_SUB_U64_PSEUDO;
7077 break;
7078 case AMDGPU::S_ADD_I32:
7079 case AMDGPU::S_SUB_I32: {
7080 // FIXME: The u32 versions currently selected use the carry.
7081 bool Changed;
7082 MachineBasicBlock *CreatedBBTmp = nullptr;
7083 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7084 if (Changed)
7085 return;
7086
7087 // Default handling
7088 break;
7089 }
7090
7091 case AMDGPU::S_MUL_U64:
7092 // Split s_mul_u64 into 32-bit vector multiplications.
7093 splitScalarSMulU64(Worklist, Inst, MDT);
7094 Inst.eraseFromParent();
7095 return;
7096
7097 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7098 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7099 // This is a special case of s_mul_u64 where all the operands are either
7100 // zero extended or sign extended.
7101 splitScalarSMulPseudo(Worklist, Inst, MDT);
7102 Inst.eraseFromParent();
7103 return;
7104
7105 case AMDGPU::S_AND_B64:
7106 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7107 Inst.eraseFromParent();
7108 return;
7109
7110 case AMDGPU::S_OR_B64:
7111 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7112 Inst.eraseFromParent();
7113 return;
7114
7115 case AMDGPU::S_XOR_B64:
7116 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7117 Inst.eraseFromParent();
7118 return;
7119
7120 case AMDGPU::S_NAND_B64:
7121 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7122 Inst.eraseFromParent();
7123 return;
7124
7125 case AMDGPU::S_NOR_B64:
7126 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7127 Inst.eraseFromParent();
7128 return;
7129
7130 case AMDGPU::S_XNOR_B64:
7131 if (ST.hasDLInsts())
7132 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7133 else
7134 splitScalar64BitXnor(Worklist, Inst, MDT);
7135 Inst.eraseFromParent();
7136 return;
7137
7138 case AMDGPU::S_ANDN2_B64:
7139 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7140 Inst.eraseFromParent();
7141 return;
7142
7143 case AMDGPU::S_ORN2_B64:
7144 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7145 Inst.eraseFromParent();
7146 return;
7147
7148 case AMDGPU::S_BREV_B64:
7149 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7150 Inst.eraseFromParent();
7151 return;
7152
7153 case AMDGPU::S_NOT_B64:
7154 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7155 Inst.eraseFromParent();
7156 return;
7157
7158 case AMDGPU::S_BCNT1_I32_B64:
7159 splitScalar64BitBCNT(Worklist, Inst);
7160 Inst.eraseFromParent();
7161 return;
7162
7163 case AMDGPU::S_BFE_I64:
7164 splitScalar64BitBFE(Worklist, Inst);
7165 Inst.eraseFromParent();
7166 return;
7167
7168 case AMDGPU::S_FLBIT_I32_B64:
7169 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7170 Inst.eraseFromParent();
7171 return;
7172 case AMDGPU::S_FF1_I32_B64:
7173 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7174 Inst.eraseFromParent();
7175 return;
7176
7177 case AMDGPU::S_LSHL_B32:
7178 if (ST.hasOnlyRevVALUShifts()) {
7179 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7180 swapOperands(Inst);
7181 }
7182 break;
7183 case AMDGPU::S_ASHR_I32:
7184 if (ST.hasOnlyRevVALUShifts()) {
7185 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7186 swapOperands(Inst);
7187 }
7188 break;
7189 case AMDGPU::S_LSHR_B32:
7190 if (ST.hasOnlyRevVALUShifts()) {
7191 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7192 swapOperands(Inst);
7193 }
7194 break;
7195 case AMDGPU::S_LSHL_B64:
7196 if (ST.hasOnlyRevVALUShifts()) {
7197 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7198 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7199 : AMDGPU::V_LSHLREV_B64_e64;
7200 swapOperands(Inst);
7201 }
7202 break;
7203 case AMDGPU::S_ASHR_I64:
7204 if (ST.hasOnlyRevVALUShifts()) {
7205 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7206 swapOperands(Inst);
7207 }
7208 break;
7209 case AMDGPU::S_LSHR_B64:
7210 if (ST.hasOnlyRevVALUShifts()) {
7211 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7212 swapOperands(Inst);
7213 }
7214 break;
7215
7216 case AMDGPU::S_ABS_I32:
7217 lowerScalarAbs(Worklist, Inst);
7218 Inst.eraseFromParent();
7219 return;
7220
7221 case AMDGPU::S_CBRANCH_SCC0:
7222 case AMDGPU::S_CBRANCH_SCC1: {
7223 // Clear unused bits of vcc
7224 Register CondReg = Inst.getOperand(1).getReg();
7225 bool IsSCC = CondReg == AMDGPU::SCC;
7226 Register VCC = RI.getVCC();
7227 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7228 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7229 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7230 .addReg(EXEC)
7231 .addReg(IsSCC ? VCC : CondReg);
7232 Inst.removeOperand(1);
7233 } break;
7234
7235 case AMDGPU::S_BFE_U64:
7236 case AMDGPU::S_BFM_B64:
7237 llvm_unreachable("Moving this op to VALU not implemented");
7238
7239 case AMDGPU::S_PACK_LL_B32_B16:
7240 case AMDGPU::S_PACK_LH_B32_B16:
7241 case AMDGPU::S_PACK_HL_B32_B16:
7242 case AMDGPU::S_PACK_HH_B32_B16:
7243 movePackToVALU(Worklist, MRI, Inst);
7244 Inst.eraseFromParent();
7245 return;
7246
7247 case AMDGPU::S_XNOR_B32:
7248 lowerScalarXnor(Worklist, Inst);
7249 Inst.eraseFromParent();
7250 return;
7251
7252 case AMDGPU::S_NAND_B32:
7253 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7254 Inst.eraseFromParent();
7255 return;
7256
7257 case AMDGPU::S_NOR_B32:
7258 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7259 Inst.eraseFromParent();
7260 return;
7261
7262 case AMDGPU::S_ANDN2_B32:
7263 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7264 Inst.eraseFromParent();
7265 return;
7266
7267 case AMDGPU::S_ORN2_B32:
7268 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7269 Inst.eraseFromParent();
7270 return;
7271
7272 // TODO: remove as soon as everything is ready
7273 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7274 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7275 // can only be selected from the uniform SDNode.
7276 case AMDGPU::S_ADD_CO_PSEUDO:
7277 case AMDGPU::S_SUB_CO_PSEUDO: {
7278 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7279 ? AMDGPU::V_ADDC_U32_e64
7280 : AMDGPU::V_SUBB_U32_e64;
7281 const auto *CarryRC = RI.getWaveMaskRegClass();
7282
7283 Register CarryInReg = Inst.getOperand(4).getReg();
7284 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7285 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7286 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7287 .addReg(CarryInReg);
7288 }
7289
7290 Register CarryOutReg = Inst.getOperand(1).getReg();
7291
7292 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7293 MRI.getRegClass(Inst.getOperand(0).getReg())));
7294 MachineInstr *CarryOp =
7295 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7296 .addReg(CarryOutReg, RegState::Define)
7297 .add(Inst.getOperand(2))
7298 .add(Inst.getOperand(3))
7299 .addReg(CarryInReg)
7300 .addImm(0);
7301 legalizeOperands(*CarryOp);
7302 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7303 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7304 Inst.eraseFromParent();
7305 }
7306 return;
7307 case AMDGPU::S_UADDO_PSEUDO:
7308 case AMDGPU::S_USUBO_PSEUDO: {
7309 const DebugLoc &DL = Inst.getDebugLoc();
7310 MachineOperand &Dest0 = Inst.getOperand(0);
7311 MachineOperand &Dest1 = Inst.getOperand(1);
7312 MachineOperand &Src0 = Inst.getOperand(2);
7313 MachineOperand &Src1 = Inst.getOperand(3);
7314
7315 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7316 ? AMDGPU::V_ADD_CO_U32_e64
7317 : AMDGPU::V_SUB_CO_U32_e64;
7318 const TargetRegisterClass *NewRC =
7319 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7320 Register DestReg = MRI.createVirtualRegister(NewRC);
7321 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7322 .addReg(Dest1.getReg(), RegState::Define)
7323 .add(Src0)
7324 .add(Src1)
7325 .addImm(0); // clamp bit
7326
7327 legalizeOperands(*NewInstr, MDT);
7328 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7329 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7330 Worklist);
7331 Inst.eraseFromParent();
7332 }
7333 return;
7334
7335 case AMDGPU::S_CSELECT_B32:
7336 case AMDGPU::S_CSELECT_B64:
7337 lowerSelect(Worklist, Inst, MDT);
7338 Inst.eraseFromParent();
7339 return;
7340 case AMDGPU::S_CMP_EQ_I32:
7341 case AMDGPU::S_CMP_LG_I32:
7342 case AMDGPU::S_CMP_GT_I32:
7343 case AMDGPU::S_CMP_GE_I32:
7344 case AMDGPU::S_CMP_LT_I32:
7345 case AMDGPU::S_CMP_LE_I32:
7346 case AMDGPU::S_CMP_EQ_U32:
7347 case AMDGPU::S_CMP_LG_U32:
7348 case AMDGPU::S_CMP_GT_U32:
7349 case AMDGPU::S_CMP_GE_U32:
7350 case AMDGPU::S_CMP_LT_U32:
7351 case AMDGPU::S_CMP_LE_U32:
7352 case AMDGPU::S_CMP_EQ_U64:
7353 case AMDGPU::S_CMP_LG_U64:
7354 case AMDGPU::S_CMP_LT_F32:
7355 case AMDGPU::S_CMP_EQ_F32:
7356 case AMDGPU::S_CMP_LE_F32:
7357 case AMDGPU::S_CMP_GT_F32:
7358 case AMDGPU::S_CMP_LG_F32:
7359 case AMDGPU::S_CMP_GE_F32:
7360 case AMDGPU::S_CMP_O_F32:
7361 case AMDGPU::S_CMP_U_F32:
7362 case AMDGPU::S_CMP_NGE_F32:
7363 case AMDGPU::S_CMP_NLG_F32:
7364 case AMDGPU::S_CMP_NGT_F32:
7365 case AMDGPU::S_CMP_NLE_F32:
7366 case AMDGPU::S_CMP_NEQ_F32:
7367 case AMDGPU::S_CMP_NLT_F32: {
7368 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7369 auto NewInstr =
7370 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7371 .setMIFlags(Inst.getFlags());
7372 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7373 0) {
7374 NewInstr
7375 .addImm(0) // src0_modifiers
7376 .add(Inst.getOperand(0)) // src0
7377 .addImm(0) // src1_modifiers
7378 .add(Inst.getOperand(1)) // src1
7379 .addImm(0); // clamp
7380 } else {
7381 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7382 }
7383 legalizeOperands(*NewInstr, MDT);
7384 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7385 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7386 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7387 Inst.eraseFromParent();
7388 return;
7389 }
7390 case AMDGPU::S_CMP_LT_F16:
7391 case AMDGPU::S_CMP_EQ_F16:
7392 case AMDGPU::S_CMP_LE_F16:
7393 case AMDGPU::S_CMP_GT_F16:
7394 case AMDGPU::S_CMP_LG_F16:
7395 case AMDGPU::S_CMP_GE_F16:
7396 case AMDGPU::S_CMP_O_F16:
7397 case AMDGPU::S_CMP_U_F16:
7398 case AMDGPU::S_CMP_NGE_F16:
7399 case AMDGPU::S_CMP_NLG_F16:
7400 case AMDGPU::S_CMP_NGT_F16:
7401 case AMDGPU::S_CMP_NLE_F16:
7402 case AMDGPU::S_CMP_NEQ_F16:
7403 case AMDGPU::S_CMP_NLT_F16: {
7404 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7405 auto NewInstr =
7406 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7407 .setMIFlags(Inst.getFlags());
7408 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
7409 NewInstr
7410 .addImm(0) // src0_modifiers
7411 .add(Inst.getOperand(0)) // src0
7412 .addImm(0) // src1_modifiers
7413 .add(Inst.getOperand(1)) // src1
7414 .addImm(0); // clamp
7415 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7416 NewInstr.addImm(0); // op_sel0
7417 } else {
7418 NewInstr
7419 .add(Inst.getOperand(0))
7420 .add(Inst.getOperand(1));
7421 }
7422 legalizeOperands(*NewInstr, MDT);
7423 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7424 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7425 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7426 Inst.eraseFromParent();
7427 return;
7428 }
7429 case AMDGPU::S_CVT_HI_F32_F16: {
7430 const DebugLoc &DL = Inst.getDebugLoc();
7431 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7432 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7433 if (ST.useRealTrue16Insts()) {
7434 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
7435 .add(Inst.getOperand(1));
7436 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7437 .addImm(0) // src0_modifiers
7438 .addReg(TmpReg, 0, AMDGPU::hi16)
7439 .addImm(0) // clamp
7440 .addImm(0) // omod
7441 .addImm(0); // op_sel0
7442 } else {
7443 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7444 .addImm(16)
7445 .add(Inst.getOperand(1));
7446 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7447 .addImm(0) // src0_modifiers
7448 .addReg(TmpReg)
7449 .addImm(0) // clamp
7450 .addImm(0); // omod
7451 }
7452
7453 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7454 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7455 Inst.eraseFromParent();
7456 return;
7457 }
7458 case AMDGPU::S_MINIMUM_F32:
7459 case AMDGPU::S_MAXIMUM_F32:
7460 case AMDGPU::S_MINIMUM_F16:
7461 case AMDGPU::S_MAXIMUM_F16: {
7462 const DebugLoc &DL = Inst.getDebugLoc();
7463 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7464 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7465 .addImm(0) // src0_modifiers
7466 .add(Inst.getOperand(1))
7467 .addImm(0) // src1_modifiers
7468 .add(Inst.getOperand(2))
7469 .addImm(0) // clamp
7470 .addImm(0); // omod
7471 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7472
7473 legalizeOperands(*NewInstr, MDT);
7474 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7475 Inst.eraseFromParent();
7476 return;
7477 }
7478 }
7479
7480 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
7481 // We cannot move this instruction to the VALU, so we should try to
7482 // legalize its operands instead.
7483 legalizeOperands(Inst, MDT);
7484 return;
7485 }
7486 // Handle converting generic instructions like COPY-to-SGPR into
7487 // COPY-to-VGPR.
7488 if (NewOpcode == Opcode) {
7489 Register DstReg = Inst.getOperand(0).getReg();
7490 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7491
7492 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7493 // hope for the best.
7494 if (Inst.isCopy() && DstReg.isPhysical() &&
7495 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7496 // TODO: Only works for 32 bit registers.
7497 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7498 get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7499 .add(Inst.getOperand(1));
7500 Inst.eraseFromParent();
7501 return;
7502 }
7503
7504 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
7505 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
7506 // Instead of creating a copy where src and dst are the same register
7507 // class, we just replace all uses of dst with src. These kinds of
7508 // copies interfere with the heuristics MachineSink uses to decide
7509 // whether or not to split a critical edge, since the pass assumes
7510 // that copies will end up as machine instructions and not be
7511 // eliminated.
7512 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
7513 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
7514 MRI.clearKillFlags(Inst.getOperand(1).getReg());
7515 Inst.getOperand(0).setReg(DstReg);
7516 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7517 // these are deleted later, but at -O0 it would leave a suspicious
7518 // looking illegal copy of an undef register.
7519 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7520 Inst.removeOperand(I);
7521 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
7522 return;
7523 }
7524 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7525 MRI.replaceRegWith(DstReg, NewDstReg);
7526 legalizeOperands(Inst, MDT);
7527 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7528 return;
7529 }
7530
7531 // Use the new VALU Opcode.
7532 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
7533 .setMIFlags(Inst.getFlags());
7534 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
7535 // Intersperse VOP3 modifiers among the SALU operands.
7536 NewInstr->addOperand(Inst.getOperand(0));
7537 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7538 AMDGPU::OpName::src0_modifiers) >= 0)
7539 NewInstr.addImm(0);
7540 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7541 MachineOperand Src = Inst.getOperand(1);
7542 if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7543 Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7544 NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7545 else
7546 NewInstr->addOperand(Src);
7547 }
7548
7549 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
7550 // We are converting these to a BFE, so we need to add the missing
7551 // operands for the size and offset.
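// For example, S_SEXT_I32_I8 is rewritten as a V_BFE_I32 with bit offset 0
// and width 8, which performs the same in-register sign extension.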
7552 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
7553 NewInstr.addImm(0);
7554 NewInstr.addImm(Size);
7555 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
7556 // The VALU version adds the second operand to the result, so insert an
7557 // extra 0 operand.
7558 NewInstr.addImm(0);
7559 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
7560 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
7561 // If we need to move this to VGPRs, we need to unpack the second
7562 // operand back into the 2 separate ones for bit offset and width.
7563 assert(OffsetWidthOp.isImm() &&
7564 "Scalar BFE is only implemented for constant width and offset");
7565 uint32_t Imm = OffsetWidthOp.getImm();
7566
7567 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
7568 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
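// For example, an S_BFE immediate of 0x100008 decodes to offset 8, width 16.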
7569 NewInstr.addImm(Offset);
7570 NewInstr.addImm(BitWidth);
7571 } else {
7572 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7573 AMDGPU::OpName::src1_modifiers) >= 0)
7574 NewInstr.addImm(0);
7575 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
7576 NewInstr->addOperand(Inst.getOperand(2));
7577 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7578 AMDGPU::OpName::src2_modifiers) >= 0)
7579 NewInstr.addImm(0);
7580 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
7581 NewInstr->addOperand(Inst.getOperand(3));
7582 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
7583 NewInstr.addImm(0);
7584 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
7585 NewInstr.addImm(0);
7586 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
7587 NewInstr.addImm(0);
7588 }
7589 } else {
7590 // Just copy the SALU operands.
7591 for (const MachineOperand &Op : Inst.explicit_operands())
7592 NewInstr->addOperand(Op);
7593 }
7594
7595 // Remove any references to SCC. Vector instructions can't read from it, and
7596 // we're just about to add the implicit use / defs of VCC, and we don't want
7597 // both.
7598 for (MachineOperand &Op : Inst.implicit_operands()) {
7599 if (Op.getReg() == AMDGPU::SCC) {
7600 // Only propagate through live-def of SCC.
7601 if (Op.isDef() && !Op.isDead())
7602 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
7603 if (Op.isUse())
7604 addSCCDefsToVALUWorklist(NewInstr, Worklist);
7605 }
7606 }
7607 Inst.eraseFromParent();
7608 Register NewDstReg;
7609 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
7610 Register DstReg = NewInstr->getOperand(0).getReg();
7611 assert(DstReg.isVirtual());
7612 // Update the destination register class.
7613 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
7614 assert(NewDstRC);
7615 NewDstReg = MRI.createVirtualRegister(NewDstRC);
7616 MRI.replaceRegWith(DstReg, NewDstReg);
7617 }
7618 fixImplicitOperands(*NewInstr);
7619 // Legalize the operands
7620 legalizeOperands(*NewInstr, MDT);
7621 if (NewDstReg)
7622 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7623}
7624
7625// Add/sub require special handling to deal with carry outs.
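// On subtargets with add-no-carry instructions (gfx9 and later), S_ADD_I32 and
// S_SUB_I32 can be rewritten in place to V_ADD_U32_e64 / V_SUB_U32_e64, as the
// helper below does; otherwise we fall back to the generic lowering, which
// currently uses the carry-writing VALU forms.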
7626std::pair<bool, MachineBasicBlock *>
7627SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
7628 MachineDominatorTree *MDT) const {
7629 if (ST.hasAddNoCarry()) {
7630 // Assume there is no user of scc since we don't select this in that case.
7631 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
7632 // is used.
7633
7634 MachineBasicBlock &MBB = *Inst.getParent();
7635 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7636
7637 Register OldDstReg = Inst.getOperand(0).getReg();
7638 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7639
7640 unsigned Opc = Inst.getOpcode();
7641 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
7642
7643 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
7644 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
7645
7646 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
7647 Inst.removeOperand(3);
7648
7649 Inst.setDesc(get(NewOpc));
7650 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
7651 Inst.addImplicitDefUseOperands(*MBB.getParent());
7652 MRI.replaceRegWith(OldDstReg, ResultReg);
7653 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
7654
7655 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7656 return std::pair(true, NewBB);
7657 }
7658
7659 return std::pair(false, nullptr);
7660}
7661
7662void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
7663 MachineDominatorTree *MDT) const {
7664
7665 MachineBasicBlock &MBB = *Inst.getParent();
7666 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7667 MachineBasicBlock::iterator MII = Inst;
7668 DebugLoc DL = Inst.getDebugLoc();
7669
7670 MachineOperand &Dest = Inst.getOperand(0);
7671 MachineOperand &Src0 = Inst.getOperand(1);
7672 MachineOperand &Src1 = Inst.getOperand(2);
7673 MachineOperand &Cond = Inst.getOperand(3);
7674
7675 Register CondReg = Cond.getReg();
7676 bool IsSCC = (CondReg == AMDGPU::SCC);
7677
7678 // If this is a trivial select where the condition is effectively not SCC
7679 // (CondReg is a source of copy to SCC), then the select is semantically
7680 // equivalent to copying CondReg. Hence, there is no need to create
7681 // a V_CNDMASK; we can just use CondReg and bail out.
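// For example, "%d = S_CSELECT_B64 -1, 0" guarded by a condition register %c
// that was only copied into SCC is just the condition mask itself, so every
// use of %d can simply read %c directly.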
7682 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
7683 (Src1.getImm() == 0)) {
7684 MRI.replaceRegWith(Dest.getReg(), CondReg);
7685 return;
7686 }
7687
7688 Register NewCondReg = CondReg;
7689 if (IsSCC) {
7690 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
7691 NewCondReg = MRI.createVirtualRegister(TC);
7692
7693 // Now look for the closest SCC def if it is a copy
7694 // replacing the CondReg with the COPY source register
7695 bool CopyFound = false;
7696 for (MachineInstr &CandI :
7697 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
7698 Inst.getParent()->rend())) {
7699 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
7700 -1) {
7701 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
7702 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
7703 .addReg(CandI.getOperand(1).getReg());
7704 CopyFound = true;
7705 }
7706 break;
7707 }
7708 }
7709 if (!CopyFound) {
7710 // SCC def is not a copy
7711 // Insert a trivial select instead of creating a copy, because a copy from
7712 // SCC would semantically mean just copying a single bit, but we may need
7713 // the result to be a vector condition mask that needs preserving.
7714 unsigned Opcode =
7715 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
7716 auto NewSelect =
7717 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
7718 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
7719 }
7720 }
7721
7722 Register NewDestReg = MRI.createVirtualRegister(
7723 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
7724 MachineInstr *NewInst;
7725 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
7726 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
7727 .addImm(0)
7728 .add(Src1) // False
7729 .addImm(0)
7730 .add(Src0) // True
7731 .addReg(NewCondReg);
7732 } else {
7733 NewInst =
7734 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
7735 .add(Src1) // False
7736 .add(Src0) // True
7737 .addReg(NewCondReg);
7738 }
7739 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
7740 legalizeOperands(*NewInst, MDT);
7741 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
7742}
7743
7744void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
7745 MachineInstr &Inst) const {
7746 MachineBasicBlock &MBB = *Inst.getParent();
7747 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7748 MachineBasicBlock::iterator MII = Inst;
7749 DebugLoc DL = Inst.getDebugLoc();
7750
7751 MachineOperand &Dest = Inst.getOperand(0);
7752 MachineOperand &Src = Inst.getOperand(1);
7753 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7754 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7755
7756 unsigned SubOp = ST.hasAddNoCarry() ?
7757 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
7758
7759 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
7760 .addImm(0)
7761 .addReg(Src.getReg());
7762
7763 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
7764 .addReg(Src.getReg())
7765 .addReg(TmpReg);
7766
7767 MRI.replaceRegWith(Dest.getReg(), ResultReg);
7768 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7769}
7770
7771void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
7772 MachineInstr &Inst) const {
7773 MachineBasicBlock &MBB = *Inst.getParent();
7774 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7775 MachineBasicBlock::iterator MII = Inst;
7776 const DebugLoc &DL = Inst.getDebugLoc();
7777
7778 MachineOperand &Dest = Inst.getOperand(0);
7779 MachineOperand &Src0 = Inst.getOperand(1);
7780 MachineOperand &Src1 = Inst.getOperand(2);
7781
7782 if (ST.hasDLInsts()) {
7783 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7784 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
7785 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
7786
7787 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
7788 .add(Src0)
7789 .add(Src1);
7790
7791 MRI.replaceRegWith(Dest.getReg(), NewDest);
7792 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7793 } else {
7794 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
7795 // invert either source and then perform the XOR. If either source is a
7796 // scalar register, then we can leave the inversion on the scalar unit to
7797 // achieve a better distribution of scalar and vector instructions.
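// For example, over 4 bits with x = 0b1100 and y = 0b1010:
//   ~(x ^ y) = ~0b0110 = 0b1001 and (~x) ^ y = 0b0011 ^ 0b1010 = 0b1001.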
7798 bool Src0IsSGPR = Src0.isReg() &&
7799 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
7800 bool Src1IsSGPR = Src1.isReg() &&
7801 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
7802 MachineInstr *Xor;
7803 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7804 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7805
7806 // Build a pair of scalar instructions and add them to the work list.
7807 // The next iteration over the work list will lower these to the vector
7808 // unit as necessary.
7809 if (Src0IsSGPR) {
7810 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
7811 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7812 .addReg(Temp)
7813 .add(Src1);
7814 } else if (Src1IsSGPR) {
7815 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
7816 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7817 .add(Src0)
7818 .addReg(Temp);
7819 } else {
7820 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
7821 .add(Src0)
7822 .add(Src1);
7823 MachineInstr *Not =
7824 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
7825 Worklist.insert(Not);
7826 }
7827
7828 MRI.replaceRegWith(Dest.getReg(), NewDest);
7829
7830 Worklist.insert(Xor);
7831
7832 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7833 }
7834}
7835
7836void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
7837 MachineInstr &Inst,
7838 unsigned Opcode) const {
7839 MachineBasicBlock &MBB = *Inst.getParent();
7840 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7841 MachineBasicBlock::iterator MII = Inst;
7842 const DebugLoc &DL = Inst.getDebugLoc();
7843
7844 MachineOperand &Dest = Inst.getOperand(0);
7845 MachineOperand &Src0 = Inst.getOperand(1);
7846 MachineOperand &Src1 = Inst.getOperand(2);
7847
7848 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7849 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7850
7851 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
7852 .add(Src0)
7853 .add(Src1);
7854
7855 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
7856 .addReg(Interm);
7857
7858 Worklist.insert(&Op);
7859 Worklist.insert(&Not);
7860
7861 MRI.replaceRegWith(Dest.getReg(), NewDest);
7862 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7863}
7864
7865void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
7866 MachineInstr &Inst,
7867 unsigned Opcode) const {
7868 MachineBasicBlock &MBB = *Inst.getParent();
7869 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7870 MachineBasicBlock::iterator MII = Inst;
7871 const DebugLoc &DL = Inst.getDebugLoc();
7872
7873 MachineOperand &Dest = Inst.getOperand(0);
7874 MachineOperand &Src0 = Inst.getOperand(1);
7875 MachineOperand &Src1 = Inst.getOperand(2);
7876
7877 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7878 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7879
7880 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
7881 .add(Src1);
7882
7883 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
7884 .add(Src0)
7885 .addReg(Interm);
7886
7887 Worklist.insert(&Not);
7888 Worklist.insert(&Op);
7889
7890 MRI.replaceRegWith(Dest.getReg(), NewDest);
7891 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7892}
7893
7894void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
7895 MachineInstr &Inst, unsigned Opcode,
7896 bool Swap) const {
7897 MachineBasicBlock &MBB = *Inst.getParent();
7898 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7899
7900 MachineOperand &Dest = Inst.getOperand(0);
7901 MachineOperand &Src0 = Inst.getOperand(1);
7902 DebugLoc DL = Inst.getDebugLoc();
7903
7904 MachineBasicBlock::iterator MII = Inst;
7905
7906 const MCInstrDesc &InstDesc = get(Opcode);
7907 const TargetRegisterClass *Src0RC = Src0.isReg() ?
7908 MRI.getRegClass(Src0.getReg()) :
7909 &AMDGPU::SGPR_32RegClass;
7910
7911 const TargetRegisterClass *Src0SubRC =
7912 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7913
7914 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7915 AMDGPU::sub0, Src0SubRC);
7916
7917 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
7918 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
7919 const TargetRegisterClass *NewDestSubRC =
7920 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
7921
7922 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
7923 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
7924
7925 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7926 AMDGPU::sub1, Src0SubRC);
7927
7928 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
7929 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
7930
7931 if (Swap)
7932 std::swap(DestSub0, DestSub1);
7933
7934 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
7935 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7936 .addReg(DestSub0)
7937 .addImm(AMDGPU::sub0)
7938 .addReg(DestSub1)
7939 .addImm(AMDGPU::sub1);
7940
7941 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7942
7943 Worklist.insert(&LoHalf);
7944 Worklist.insert(&HiHalf);
7945
7946 // We don't need to legalizeOperands here because for a single operand, src0
7947 // will support any kind of input.
7948
7949 // Move all users of this moved value.
7950 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7951}
7952
7953// There is not a vector equivalent of s_mul_u64. For this reason, we need to
7954 // split the s_mul_u64 into 32-bit vector multiplications.
7955void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
7956 MachineInstr &Inst,
7957 MachineDominatorTree *MDT) const {
7958 MachineBasicBlock &MBB = *Inst.getParent();
7959 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7960
7961 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7962 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7963 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7964
7965 MachineOperand &Dest = Inst.getOperand(0);
7966 MachineOperand &Src0 = Inst.getOperand(1);
7967 MachineOperand &Src1 = Inst.getOperand(2);
7968 const DebugLoc &DL = Inst.getDebugLoc();
7969 MachineBasicBlock::iterator MII = Inst;
7970
7971 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7972 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7973 const TargetRegisterClass *Src0SubRC =
7974 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7975 if (RI.isSGPRClass(Src0SubRC))
7976 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7977 const TargetRegisterClass *Src1SubRC =
7978 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7979 if (RI.isSGPRClass(Src1SubRC))
7980 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7981
7982 // First, we extract the low 32-bit and high 32-bit values from each of the
7983 // operands.
7984 MachineOperand Op0L =
7985 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7986 MachineOperand Op1L =
7987 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7988 MachineOperand Op0H =
7989 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
7990 MachineOperand Op1H =
7991 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
7992
7993 // The multiplication is done as follows:
7994 //
7995 // Op1H Op1L
7996 // * Op0H Op0L
7997 // --------------------
7998 // Op1H*Op0L Op1L*Op0L
7999 // + Op1H*Op0H Op1L*Op0H
8000 // -----------------------------------------
8001 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8002 //
8003 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8004 // value and that would overflow.
8005 // The low 32-bit value is Op1L*Op0L.
8006 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
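// Equivalently, as a plain C sketch:
//   uint32_t Lo = Op0L * Op1L;
//   uint32_t Hi = Op1L * Op0H + Op1H * Op0L +
//                 (uint32_t)(((uint64_t)Op1L * (uint64_t)Op0L) >> 32);
//   uint64_t Result = ((uint64_t)Hi << 32) | Lo;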
8007
8008 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8009 MachineInstr *Op1L_Op0H =
8010 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8011 .add(Op1L)
8012 .add(Op0H);
8013
8014 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8015 MachineInstr *Op1H_Op0L =
8016 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8017 .add(Op1H)
8018 .add(Op0L);
8019
8020 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8021 MachineInstr *Carry =
8022 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8023 .add(Op1L)
8024 .add(Op0L);
8025
8026 MachineInstr *LoHalf =
8027 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8028 .add(Op1L)
8029 .add(Op0L);
8030
8031 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8032 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8033 .addReg(Op1L_Op0H_Reg)
8034 .addReg(Op1H_Op0L_Reg);
8035
8036 MachineInstr *HiHalf =
8037 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8038 .addReg(AddReg)
8039 .addReg(CarryReg);
8040
8041 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8042 .addReg(DestSub0)
8043 .addImm(AMDGPU::sub0)
8044 .addReg(DestSub1)
8045 .addImm(AMDGPU::sub1);
8046
8047 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8048
8049 // Try to legalize the operands in case we need to swap the order to keep it
8050 // valid.
8051 legalizeOperands(*Op1L_Op0H, MDT);
8052 legalizeOperands(*Op1H_Op0L, MDT);
8053 legalizeOperands(*Carry, MDT);
8054 legalizeOperands(*LoHalf, MDT);
8055 legalizeOperands(*Add, MDT);
8056 legalizeOperands(*HiHalf, MDT);
8057
8058 // Move all users of this moved value.
8059 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8060}
8061
8062 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8063// multiplications.
8064void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8065 MachineInstr &Inst,
8066 MachineDominatorTree *MDT) const {
8067 MachineBasicBlock &MBB = *Inst.getParent();
8068 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8069
8070 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8071 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8072 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8073
8074 MachineOperand &Dest = Inst.getOperand(0);
8075 MachineOperand &Src0 = Inst.getOperand(1);
8076 MachineOperand &Src1 = Inst.getOperand(2);
8077 const DebugLoc &DL = Inst.getDebugLoc();
8078 MachineBasicBlock::iterator MII = Inst;
8079
8080 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8081 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8082 const TargetRegisterClass *Src0SubRC =
8083 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8084 if (RI.isSGPRClass(Src0SubRC))
8085 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8086 const TargetRegisterClass *Src1SubRC =
8087 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8088 if (RI.isSGPRClass(Src1SubRC))
8089 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8090
8091 // First, we extract the low 32-bit and high 32-bit values from each of the
8092 // operands.
8093 MachineOperand Op0L =
8094 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8095 MachineOperand Op1L =
8096 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8097
8098 unsigned Opc = Inst.getOpcode();
8099 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8100 ? AMDGPU::V_MUL_HI_U32_e64
8101 : AMDGPU::V_MUL_HI_I32_e64;
8102 MachineInstr *HiHalf =
8103 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8104
8105 MachineInstr *LoHalf =
8106 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8107 .add(Op1L)
8108 .add(Op0L);
8109
8110 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8111 .addReg(DestSub0)
8112 .addImm(AMDGPU::sub0)
8113 .addReg(DestSub1)
8114 .addImm(AMDGPU::sub1);
8115
8116 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8117
8118 // Try to legalize the operands in case we need to swap the order to keep it
8119 // valid.
8120 legalizeOperands(*HiHalf, MDT);
8121 legalizeOperands(*LoHalf, MDT);
8122
8123 // Move all users of this moved value.
8124 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8125}
8126
8127void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8128 MachineInstr &Inst, unsigned Opcode,
8129 MachineDominatorTree *MDT) const {
8130 MachineBasicBlock &MBB = *Inst.getParent();
8131 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8132
8133 MachineOperand &Dest = Inst.getOperand(0);
8134 MachineOperand &Src0 = Inst.getOperand(1);
8135 MachineOperand &Src1 = Inst.getOperand(2);
8136 DebugLoc DL = Inst.getDebugLoc();
8137
8138 MachineBasicBlock::iterator MII = Inst;
8139
8140 const MCInstrDesc &InstDesc = get(Opcode);
8141 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8142 MRI.getRegClass(Src0.getReg()) :
8143 &AMDGPU::SGPR_32RegClass;
8144
8145 const TargetRegisterClass *Src0SubRC =
8146 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8147 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8148 MRI.getRegClass(Src1.getReg()) :
8149 &AMDGPU::SGPR_32RegClass;
8150
8151 const TargetRegisterClass *Src1SubRC =
8152 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8153
8154 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8155 AMDGPU::sub0, Src0SubRC);
8156 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8157 AMDGPU::sub0, Src1SubRC);
8158 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8159 AMDGPU::sub1, Src0SubRC);
8160 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8161 AMDGPU::sub1, Src1SubRC);
8162
8163 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8164 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8165 const TargetRegisterClass *NewDestSubRC =
8166 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8167
8168 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8169 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8170 .add(SrcReg0Sub0)
8171 .add(SrcReg1Sub0);
8172
8173 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8174 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8175 .add(SrcReg0Sub1)
8176 .add(SrcReg1Sub1);
8177
8178 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8179 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8180 .addReg(DestSub0)
8181 .addImm(AMDGPU::sub0)
8182 .addReg(DestSub1)
8183 .addImm(AMDGPU::sub1);
8184
8185 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8186
8187 Worklist.insert(&LoHalf);
8188 Worklist.insert(&HiHalf);
8189
8190 // Move all users of this moved value.
8191 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8192}
8193
8194void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8195 MachineInstr &Inst,
8196 MachineDominatorTree *MDT) const {
8197 MachineBasicBlock &MBB = *Inst.getParent();
8198 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8199
8200 MachineOperand &Dest = Inst.getOperand(0);
8201 MachineOperand &Src0 = Inst.getOperand(1);
8202 MachineOperand &Src1 = Inst.getOperand(2);
8203 const DebugLoc &DL = Inst.getDebugLoc();
8204
8205 MachineBasicBlock::iterator MII = Inst;
8206
8207 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8208
8209 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8210
8211 MachineOperand* Op0;
8212 MachineOperand* Op1;
8213
8214 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8215 Op0 = &Src0;
8216 Op1 = &Src1;
8217 } else {
8218 Op0 = &Src1;
8219 Op1 = &Src0;
8220 }
8221
8222 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8223 .add(*Op0);
8224
8225 Register NewDest = MRI.createVirtualRegister(DestRC);
8226
8227 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8228 .addReg(Interm)
8229 .add(*Op1);
8230
8231 MRI.replaceRegWith(Dest.getReg(), NewDest);
8232
8233 Worklist.insert(&Xor);
8234}
8235
8236void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8237 MachineInstr &Inst) const {
8238 MachineBasicBlock &MBB = *Inst.getParent();
8239 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8240
8241 MachineBasicBlock::iterator MII = Inst;
8242 const DebugLoc &DL = Inst.getDebugLoc();
8243
8244 MachineOperand &Dest = Inst.getOperand(0);
8245 MachineOperand &Src = Inst.getOperand(1);
8246
8247 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8248 const TargetRegisterClass *SrcRC = Src.isReg() ?
8249 MRI.getRegClass(Src.getReg()) :
8250 &AMDGPU::SGPR_32RegClass;
8251
8252 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8253 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8254
8255 const TargetRegisterClass *SrcSubRC =
8256 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8257
8258 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8259 AMDGPU::sub0, SrcSubRC);
8260 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8261 AMDGPU::sub1, SrcSubRC);
8262
8263 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8264
8265 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8266
8267 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8268
8269 // We don't need to legalize operands here. src0 for either instruction can be
8270 // an SGPR, and the second input is unused or determined here.
8271 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8272}
8273
8274void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8275 MachineInstr &Inst) const {
8276 MachineBasicBlock &MBB = *Inst.getParent();
8277 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8278 MachineBasicBlock::iterator MII = Inst;
8279 const DebugLoc &DL = Inst.getDebugLoc();
8280
8281 MachineOperand &Dest = Inst.getOperand(0);
8282 uint32_t Imm = Inst.getOperand(2).getImm();
8283 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8284 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8285
8286 (void) Offset;
8287
8288 // Only sext_inreg cases handled.
8289 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8290 Offset == 0 && "Not implemented");
8291
8292 if (BitWidth < 32) {
8293 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8294 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8295 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8296
8297 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8298 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8299 .addImm(0)
8300 .addImm(BitWidth);
8301
8302 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8303 .addImm(31)
8304 .addReg(MidRegLo);
8305
8306 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8307 .addReg(MidRegLo)
8308 .addImm(AMDGPU::sub0)
8309 .addReg(MidRegHi)
8310 .addImm(AMDGPU::sub1);
8311
8312 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8313 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8314 return;
8315 }
8316
8317 MachineOperand &Src = Inst.getOperand(1);
8318 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8319 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8320
8321 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8322 .addImm(31)
8323 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8324
8325 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8326 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8327 .addImm(AMDGPU::sub0)
8328 .addReg(TmpReg)
8329 .addImm(AMDGPU::sub1);
8330
8331 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8332 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8333}
8334
8335void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8336 MachineInstr &Inst, unsigned Opcode,
8337 MachineDominatorTree *MDT) const {
8338 // (S_FLBIT_I32_B64 hi:lo) ->
8339 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8340 // (S_FF1_I32_B64 hi:lo) ->
8341 // -> (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
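// For example, S_FLBIT_I32_B64 of hi = 0, lo = 0x8000 (bit 15 set):
//   ffbh(hi) = 0xffffffff (no bit found), ffbh(lo) = 16,
//   uaddsat(16, 32) = 48, umin(0xffffffff, 48) = 48 leading zeros.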
8342
8343 MachineBasicBlock &MBB = *Inst.getParent();
8344 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8345 MachineBasicBlock::iterator MII = Inst;
8346 const DebugLoc &DL = Inst.getDebugLoc();
8347
8348 MachineOperand &Dest = Inst.getOperand(0);
8349 MachineOperand &Src = Inst.getOperand(1);
8350
8351 const MCInstrDesc &InstDesc = get(Opcode);
8352
8353 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8354 unsigned OpcodeAdd =
8355 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8356
8357 const TargetRegisterClass *SrcRC =
8358 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8359 const TargetRegisterClass *SrcSubRC =
8360 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8361
8362 MachineOperand SrcRegSub0 =
8363 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8364 MachineOperand SrcRegSub1 =
8365 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8366
8367 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8368 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8369 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8370 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8371
8372 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8373
8374 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8375
8376 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8377 .addReg(IsCtlz ? MidReg1 : MidReg2)
8378 .addImm(32)
8379 .addImm(1); // enable clamp
8380
8381 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8382 .addReg(MidReg3)
8383 .addReg(IsCtlz ? MidReg2 : MidReg1);
8384
8385 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8386
8387 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8388}
8389
8390void SIInstrInfo::addUsersToMoveToVALUWorklist(
8391 Register DstReg, MachineRegisterInfo &MRI,
8392 SIInstrWorklist &Worklist) const {
8393 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
8394 E = MRI.use_end(); I != E;) {
8395 MachineInstr &UseMI = *I->getParent();
8396
8397 unsigned OpNo = 0;
8398
8399 switch (UseMI.getOpcode()) {
8400 case AMDGPU::COPY:
8401 case AMDGPU::WQM:
8402 case AMDGPU::SOFT_WQM:
8403 case AMDGPU::STRICT_WWM:
8404 case AMDGPU::STRICT_WQM:
8405 case AMDGPU::REG_SEQUENCE:
8406 case AMDGPU::PHI:
8407 case AMDGPU::INSERT_SUBREG:
8408 break;
8409 default:
8410 OpNo = I.getOperandNo();
8411 break;
8412 }
8413
8414 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
8415 Worklist.insert(&UseMI);
8416
8417 do {
8418 ++I;
8419 } while (I != E && I->getParent() == &UseMI);
8420 } else {
8421 ++I;
8422 }
8423 }
8424}
8425
8426void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
8427 MachineRegisterInfo &MRI,
8428 MachineInstr &Inst) const {
8429 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8430 MachineBasicBlock *MBB = Inst.getParent();
8431 MachineOperand &Src0 = Inst.getOperand(1);
8432 MachineOperand &Src1 = Inst.getOperand(2);
8433 const DebugLoc &DL = Inst.getDebugLoc();
8434
8435 switch (Inst.getOpcode()) {
8436 case AMDGPU::S_PACK_LL_B32_B16: {
8437 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8438 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8439
8440 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8441 // 0.
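// The sequence below computes ResultReg = (Src1 << 16) | (Src0 & 0xffff),
// i.e. the low halves of the two sources packed into one 32-bit register.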
8442 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8443 .addImm(0xffff);
8444
8445 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
8446 .addReg(ImmReg, RegState::Kill)
8447 .add(Src0);
8448
8449 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8450 .add(Src1)
8451 .addImm(16)
8452 .addReg(TmpReg, RegState::Kill);
8453 break;
8454 }
8455 case AMDGPU::S_PACK_LH_B32_B16: {
8456 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8457 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8458 .addImm(0xffff);
8459 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
8460 .addReg(ImmReg, RegState::Kill)
8461 .add(Src0)
8462 .add(Src1);
8463 break;
8464 }
8465 case AMDGPU::S_PACK_HL_B32_B16: {
8466 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8467 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8468 .addImm(16)
8469 .add(Src0);
8470 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8471 .add(Src1)
8472 .addImm(16)
8473 .addReg(TmpReg, RegState::Kill);
8474 break;
8475 }
8476 case AMDGPU::S_PACK_HH_B32_B16: {
8477 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8478 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8479 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8480 .addImm(16)
8481 .add(Src0);
8482 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8483 .addImm(0xffff0000);
8484 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
8485 .add(Src1)
8486 .addReg(ImmReg, RegState::Kill)
8487 .addReg(TmpReg, RegState::Kill);
8488 break;
8489 }
8490 default:
8491 llvm_unreachable("unhandled s_pack_* instruction");
8492 }
8493
8494 MachineOperand &Dest = Inst.getOperand(0);
8495 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8496 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8497}
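// --- Illustrative sketch (editor's addition, not part of SIInstrInfo.cpp) ---
// Scalar semantics that the VALU expansions above reproduce: LL uses
// V_AND + V_LSHL_OR, LH uses V_BFI with a 0xffff mask, and HL/HH start from a
// 16-bit logical shift right of Src0. Plain C++ stand-ins, not the lowering.
#include <cstdint>

static uint32_t pack_ll(uint32_t S0, uint32_t S1) { // S_PACK_LL_B32_B16
  return (S1 << 16) | (S0 & 0xffffu);
}
static uint32_t pack_lh(uint32_t S0, uint32_t S1) { // S_PACK_LH_B32_B16
  return (S0 & 0xffffu) | (S1 & 0xffff0000u);       // == V_BFI(0xffff, S0, S1)
}
static uint32_t pack_hl(uint32_t S0, uint32_t S1) { // S_PACK_HL_B32_B16
  return (S1 << 16) | (S0 >> 16);
}
static uint32_t pack_hh(uint32_t S0, uint32_t S1) { // S_PACK_HH_B32_B16
  return (S1 & 0xffff0000u) | (S0 >> 16);
}
// --- end sketch ---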
8498
8499void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
8500 MachineInstr &SCCDefInst,
8501 SIInstrWorklist &Worklist,
8502 Register NewCond) const {
8503
8504 // Ensure that def inst defines SCC, which is still live.
8505 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
8506 !Op.isDead() && Op.getParent() == &SCCDefInst);
8507 SmallVector<MachineInstr *, 4> CopyToDelete;
8508 // This assumes that all the users of SCC are in the same block
8509 // as the SCC def.
8510 for (MachineInstr &MI : // Skip the def inst itself.
8511 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
8512 SCCDefInst.getParent()->end())) {
8513 // Check if SCC is used first.
8514 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
8515 if (SCCIdx != -1) {
8516 if (MI.isCopy()) {
8517 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8518 Register DestReg = MI.getOperand(0).getReg();
8519
8520 MRI.replaceRegWith(DestReg, NewCond);
8521 CopyToDelete.push_back(&MI);
8522 } else {
8523
8524 if (NewCond.isValid())
8525 MI.getOperand(SCCIdx).setReg(NewCond);
8526
8527 Worklist.insert(&MI);
8528 }
8529 }
8530 // Exit if we find another SCC def.
8531 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
8532 break;
8533 }
8534 for (auto &Copy : CopyToDelete)
8535 Copy->eraseFromParent();
8536}
8537
8538// Instructions that use SCC may be converted to VALU instructions. When that
8539// happens, the SCC register is changed to VCC_LO. The instruction that defines
8540// SCC must be changed to an instruction that defines VCC. This function makes
8541// sure that the instruction that defines SCC is added to the moveToVALU
8542// worklist.
8543void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
8544 SIInstrWorklist &Worklist) const {
8545 // Look for a preceding instruction that either defines VCC or SCC. If VCC
8546 // then there is nothing to do because the defining instruction has been
8547 // converted to a VALU already. If SCC then that instruction needs to be
8548 // converted to a VALU.
8549 for (MachineInstr &MI :
8550 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
8551 SCCUseInst->getParent()->rend())) {
8552 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
8553 break;
8554 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
8555 Worklist.insert(&MI);
8556 break;
8557 }
8558 }
8559}
8560
8561const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
8562 const MachineInstr &Inst) const {
8563 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
8564
8565 switch (Inst.getOpcode()) {
8566 // For target instructions, getOpRegClass just returns the virtual register
8567 // class associated with the operand, so we need to find an equivalent VGPR
8568 // register class in order to move the instruction to the VALU.
8569 case AMDGPU::COPY:
8570 case AMDGPU::PHI:
8571 case AMDGPU::REG_SEQUENCE:
8572 case AMDGPU::INSERT_SUBREG:
8573 case AMDGPU::WQM:
8574 case AMDGPU::SOFT_WQM:
8575 case AMDGPU::STRICT_WWM:
8576 case AMDGPU::STRICT_WQM: {
8577 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
8578 if (RI.isAGPRClass(SrcRC)) {
8579 if (RI.isAGPRClass(NewDstRC))
8580 return nullptr;
8581
8582 switch (Inst.getOpcode()) {
8583 case AMDGPU::PHI:
8584 case AMDGPU::REG_SEQUENCE:
8585 case AMDGPU::INSERT_SUBREG:
8586 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
8587 break;
8588 default:
8589 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8590 }
8591
8592 if (!NewDstRC)
8593 return nullptr;
8594 } else {
8595 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
8596 return nullptr;
8597
8598 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8599 if (!NewDstRC)
8600 return nullptr;
8601 }
8602
8603 return NewDstRC;
8604 }
8605 default:
8606 return NewDstRC;
8607 }
8608}
8609
8610// Find the one SGPR operand we are allowed to use.
8611Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
8612 int OpIndices[3]) const {
8613 const MCInstrDesc &Desc = MI.getDesc();
8614
8615 // Find the one SGPR operand we are allowed to use.
8616 //
8617 // First we need to consider the instruction's operand requirements before
8618 // legalizing. Some operands are required to be SGPRs, such as implicit uses
8619 // of VCC, but we are still bound by the constant bus requirement to only use
8620 // one.
8621 //
8622 // If the operand's class is an SGPR, we can never move it.
8623
8624 Register SGPRReg = findImplicitSGPRRead(MI);
8625 if (SGPRReg)
8626 return SGPRReg;
8627
8628 Register UsedSGPRs[3] = {Register()};
8629 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8630
8631 for (unsigned i = 0; i < 3; ++i) {
8632 int Idx = OpIndices[i];
8633 if (Idx == -1)
8634 break;
8635
8636 const MachineOperand &MO = MI.getOperand(Idx);
8637 if (!MO.isReg())
8638 continue;
8639
8640 // Is this operand statically required to be an SGPR based on the operand
8641 // constraints?
8642 const TargetRegisterClass *OpRC =
8643 RI.getRegClass(Desc.operands()[Idx].RegClass);
8644 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
8645 if (IsRequiredSGPR)
8646 return MO.getReg();
8647
8648 // If this could be a VGPR or an SGPR, check the dynamic register class.
8649 Register Reg = MO.getReg();
8650 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
8651 if (RI.isSGPRClass(RegRC))
8652 UsedSGPRs[i] = Reg;
8653 }
8654
8655 // We don't have a required SGPR operand, so we have a bit more freedom in
8656 // selecting operands to move.
8657
8658 // Try to select the most used SGPR. If an SGPR is equal to one of the
8659 // others, we choose that.
8660 //
8661 // e.g.
8662 // V_FMA_F32 v0, s0, s0, s0 -> No moves
8663 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
8664
8665 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
8666 // prefer those.
8667
8668 if (UsedSGPRs[0]) {
8669 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
8670 SGPRReg = UsedSGPRs[0];
8671 }
8672
8673 if (!SGPRReg && UsedSGPRs[1]) {
8674 if (UsedSGPRs[1] == UsedSGPRs[2])
8675 SGPRReg = UsedSGPRs[1];
8676 }
8677
8678 return SGPRReg;
8679}
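// --- Illustrative sketch (editor's addition, not part of SIInstrInfo.cpp) ---
// The selection at the end of findUsedSGPR keeps at most one distinct SGPR
// among the sources: an SGPR that equals another source operand can stay,
// since reading the same SGPR more than once generally counts once against
// the constant-bus limit. Hypothetical stand-in using plain register ids
// (0 means "no SGPR in that slot").
static unsigned pickReusedSGPR(unsigned S0, unsigned S1, unsigned S2) {
  if (S0 && (S0 == S1 || S0 == S2))
    return S0;
  if (S1 && S1 == S2)
    return S1;
  return 0; // no repeated SGPR; the legalizer may have to move operands
}
// e.g. (s0, s0, s0) -> s0 (no moves needed), (s0, s1, s0) -> s0 (move only s1).
// --- end sketch ---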
8680
8681 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
8682 unsigned OperandName) const {
8683 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
8684 if (Idx == -1)
8685 return nullptr;
8686
8687 return &MI.getOperand(Idx);
8688}
8689
8690 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
8691 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
8692 uint64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
8693 ? (uint64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
8694 : (uint64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
8695 return (Format << 44) |
8696 (1ULL << 56) | // RESOURCE_LEVEL = 1
8697 (3ULL << 60); // OOB_SELECT = 3
8698 }
8699
8700 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
8701 if (ST.isAmdHsaOS()) {
8702 // Set ATC = 1. GFX9 doesn't have this bit.
8703 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8704 RsrcDataFormat |= (1ULL << 56);
8705
8706 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
8707 // BTW, it disables TC L2 and therefore decreases performance.
8708 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
8709 RsrcDataFormat |= (2ULL << 59);
8710 }
8711
8712 return RsrcDataFormat;
8713}
8714
8715 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
8716 uint64_t Rsrc23 = AMDGPU::RSRC_DATA_FORMAT |
8717 AMDGPU::RSRC_TID_ENABLE |
8718 0xffffffff; // Size;
8719
8720 // GFX9 doesn't have ELEMENT_SIZE.
8721 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
8722 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
8723 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
8724 }
8725
8726 // IndexStride = 64 / 32.
8727 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
8728 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
8729
8730 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
8731 // Clear them unless we want a huge stride.
8732 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
8733 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
8734 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
8735
8736 return Rsrc23;
8737}
8738
8739 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
8740 unsigned Opc = MI.getOpcode();
8741
8742 return isSMRD(Opc);
8743}
8744
8745 bool SIInstrInfo::isHighLatencyDef(int Opc) const {
8746 return get(Opc).mayLoad() &&
8747 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
8748}
8749
8750 Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
8751 int &FrameIndex) const {
8752 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
8753 if (!Addr || !Addr->isFI())
8754 return Register();
8755
8756 assert(!MI.memoperands_empty() &&
8757 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
8758
8759 FrameIndex = Addr->getIndex();
8760 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
8761}
8762
8763 Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
8764 int &FrameIndex) const {
8765 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
8766 assert(Addr && Addr->isFI());
8767 FrameIndex = Addr->getIndex();
8768 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
8769}
8770
8771 Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
8772 int &FrameIndex) const {
8773 if (!MI.mayLoad())
8774 return Register();
8775
8776 if (isMUBUF(MI) || isVGPRSpill(MI))
8777 return isStackAccess(MI, FrameIndex);
8778
8779 if (isSGPRSpill(MI))
8780 return isSGPRStackAccess(MI, FrameIndex);
8781
8782 return Register();
8783}
8784
8785 Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
8786 int &FrameIndex) const {
8787 if (!MI.mayStore())
8788 return Register();
8789
8790 if (isMUBUF(MI) || isVGPRSpill(MI))
8791 return isStackAccess(MI, FrameIndex);
8792
8793 if (isSGPRSpill(MI))
8794 return isSGPRStackAccess(MI, FrameIndex);
8795
8796 return Register();
8797}
8798
8799 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
8800 unsigned Size = 0;
8801 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
8802 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
8803 while (++I != E && I->isInsideBundle()) {
8804 assert(!I->isBundle() && "No nested bundle!");
8805 Size += getInstSizeInBytes(*I);
8806 }
8807
8808 return Size;
8809}
8810
8811 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
8812 unsigned Opc = MI.getOpcode();
8813 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
8814 unsigned DescSize = Desc.getSize();
8815
8816 // If we have a definitive size, we can use it. Otherwise we need to inspect
8817 // the operands to know the size.
8818 if (isFixedSize(MI)) {
8819 unsigned Size = DescSize;
8820
8821 // If we hit the buggy offset, an extra nop will be inserted in MC so
8822 // estimate the worst case.
8823 if (MI.isBranch() && ST.hasOffset3fBug())
8824 Size += 4;
8825
8826 return Size;
8827 }
8828
8829 // Instructions may have a 32-bit literal encoded after them. Check
8830 // operands that could ever be literals.
8831 if (isVALU(MI) || isSALU(MI)) {
8832 if (isDPP(MI))
8833 return DescSize;
8834 bool HasLiteral = false;
8835 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
8836 const MachineOperand &Op = MI.getOperand(I);
8837 const MCOperandInfo &OpInfo = Desc.operands()[I];
8838 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
8839 HasLiteral = true;
8840 break;
8841 }
8842 }
8843 return HasLiteral ? DescSize + 4 : DescSize;
8844 }
8845
8846 // Check whether we have extra NSA words.
8847 if (isMIMG(MI)) {
8848 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
8849 if (VAddr0Idx < 0)
8850 return 8;
8851
8852 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
8853 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
8854 }
8855
8856 switch (Opc) {
8857 case TargetOpcode::BUNDLE:
8858 return getInstBundleSize(MI);
8859 case TargetOpcode::INLINEASM:
8860 case TargetOpcode::INLINEASM_BR: {
8861 const MachineFunction *MF = MI.getParent()->getParent();
8862 const char *AsmStr = MI.getOperand(0).getSymbolName();
8863 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
8864 }
8865 default:
8866 if (MI.isMetaInstruction())
8867 return 0;
8868 return DescSize;
8869 }
8870}
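// --- Illustrative sketch (editor's addition, not part of SIInstrInfo.cpp) ---
// Two of the size rules above restated as plain arithmetic: a VALU/SALU
// instruction with a 32-bit literal operand is its descriptor size plus 4
// bytes, and (as I read the NSA formula) an NSA MIMG instruction adds one
// 4-byte word per group of up to four extra addresses beyond vaddr0, where
// NumVAddr plays the role of RSrcIdx - VAddr0Idx.
static unsigned sizeWithLiteral(unsigned DescSize, bool HasLiteral) {
  return HasLiteral ? DescSize + 4 : DescSize;
}
static unsigned mimgNSASize(int NumVAddr) {
  return 8 + 4 * ((NumVAddr + 2) / 4);
}
// e.g. mimgNSASize(1) == 8 (no NSA words), mimgNSASize(5) == 12,
// mimgNSASize(6) == 16.
// --- end sketch ---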
8871
8872 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
8873 if (!isFLAT(MI))
8874 return false;
8875
8876 if (MI.memoperands_empty())
8877 return true;
8878
8879 for (const MachineMemOperand *MMO : MI.memoperands()) {
8880 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
8881 return true;
8882 }
8883 return false;
8884}
8885
8886 ArrayRef<std::pair<int, const char *>>
8887 SIInstrInfo::getSerializableTargetIndices() const {
8888 static const std::pair<int, const char *> TargetIndices[] = {
8889 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
8890 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
8891 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
8892 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
8893 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
8894 return ArrayRef(TargetIndices);
8895}
8896
8897/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
8898/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
8899 ScheduleHazardRecognizer *
8900 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
8901 const ScheduleDAG *DAG) const {
8902 return new GCNHazardRecognizer(DAG->MF);
8903}
8904
8905/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
8906/// pass.
8907 ScheduleHazardRecognizer *
8908 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
8909 return new GCNHazardRecognizer(MF);
8910}
8911
8912// Called during:
8913// - pre-RA scheduling and post-RA scheduling
8914 ScheduleHazardRecognizer *
8915 SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
8916 const ScheduleDAGMI *DAG) const {
8917 // Borrowed from Arm Target
8918 // We would like to restrict this hazard recognizer to only
8919 // post-RA scheduling; we can tell that we're post-RA because we don't
8920 // track VRegLiveness.
8921 if (!DAG->hasVRegLiveness())
8922 return new GCNHazardRecognizer(DAG->MF);
8923 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
8924}
8925
8926 std::pair<unsigned, unsigned>
8927 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
8928 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
8929}
8930
8931 ArrayRef<std::pair<unsigned, const char *>>
8932 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
8933 static const std::pair<unsigned, const char *> TargetFlags[] = {
8934 { MO_GOTPCREL, "amdgpu-gotprel" },
8935 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
8936 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
8937 { MO_REL32_LO, "amdgpu-rel32-lo" },
8938 { MO_REL32_HI, "amdgpu-rel32-hi" },
8939 { MO_ABS32_LO, "amdgpu-abs32-lo" },
8940 { MO_ABS32_HI, "amdgpu-abs32-hi" },
8941 };
8942
8943 return ArrayRef(TargetFlags);
8944}
8945
8946 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
8947 SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
8948 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8949 {
8950 {MONoClobber, "amdgpu-noclobber"},
8951 {MOLastUse, "amdgpu-last-use"},
8952 };
8953
8954 return ArrayRef(TargetFlags);
8955}
8956
8957 unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
8958 const MachineFunction &MF) const {
8959 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
8960 assert(SrcReg.isVirtual());
8961 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
8962 return AMDGPU::WWM_COPY;
8963
8964 return AMDGPU::COPY;
8965}
8966
8967 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
8968 Register Reg) const {
8969 // We need to handle instructions which may be inserted during register
8970 // allocation to handle the prolog. The initial prolog instruction may have
8971 // been separated from the start of the block by spills and copies inserted
8972 // for the prolog. However, the insertions for scalar registers can
8973 // always be placed at the BB top as they are independent of the exec mask
8974 // value.
8975 const MachineFunction *MF = MI.getParent()->getParent();
8976 bool IsNullOrVectorRegister = true;
8977 if (Reg) {
8978 const MachineRegisterInfo &MRI = MF->getRegInfo();
8979 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
8980 }
8981
8982 uint16_t Opcode = MI.getOpcode();
8983 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
8984 return IsNullOrVectorRegister &&
8985 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
8986 (Opcode == AMDGPU::IMPLICIT_DEF &&
8987 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
8988 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
8989 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
8990}
8991
8993 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
8994 MachineBasicBlock::iterator I,
8995 const DebugLoc &DL,
8996 Register DestReg) const {
8997 if (ST.hasAddNoCarry())
8998 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
8999
9000 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9001 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9002 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9003
9004 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9005 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9006}
9007
9008 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9009 MachineBasicBlock::iterator I,
9010 const DebugLoc &DL,
9011 Register DestReg,
9012 RegScavenger &RS) const {
9013 if (ST.hasAddNoCarry())
9014 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9015
9016 // If available, prefer to use vcc.
9017 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9018 ? Register(RI.getVCC())
9019 : RS.scavengeRegisterBackwards(
9020 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9021 0, /* AllowSpill */ false);
9022
9023 // TODO: Users need to deal with this.
9024 if (!UnusedCarry.isValid())
9025 return MachineInstrBuilder();
9026
9027 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9028 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9029}
9030
9031bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9032 switch (Opcode) {
9033 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9034 case AMDGPU::SI_KILL_I1_TERMINATOR:
9035 return true;
9036 default:
9037 return false;
9038 }
9039}
9040
9041 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
9042 switch (Opcode) {
9043 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9044 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9045 case AMDGPU::SI_KILL_I1_PSEUDO:
9046 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9047 default:
9048 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9049 }
9050}
9051
9052bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9053 return Imm <= getMaxMUBUFImmOffset(ST);
9054}
9055
9056 unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
9057 // GFX12 has a 24-bit signed byte offset field, so its largest non-negative value uses 23 bits; older targets have a 12-bit unsigned field.
9058 const unsigned OffsetBits =
9059 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9060 return (1 << OffsetBits) - 1;
9061}
9062
9063 void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
9064 if (!ST.isWave32())
9065 return;
9066
9067 if (MI.isInlineAsm())
9068 return;
9069
9070 for (auto &Op : MI.implicit_operands()) {
9071 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9072 Op.setReg(AMDGPU::VCC_LO);
9073 }
9074}
9075
9076 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
9077 if (!isSMRD(MI))
9078 return false;
9079
9080 // Check that it is using a buffer resource.
9081 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9082 if (Idx == -1) // e.g. s_memtime
9083 return false;
9084
9085 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9086 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9087}
9088
9089// Given Imm, split it into the values to put into the SOffset and ImmOffset
9090// fields in an MUBUF instruction. Return false if it is not possible (due to a
9091// hardware bug needing a workaround).
9092//
9093// The required alignment ensures that individual address components remain
9094// aligned if they are aligned to begin with. It also ensures that additional
9095// offsets within the given alignment can be added to the resulting ImmOffset.
9096 bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
9097 uint32_t &ImmOffset, Align Alignment) const {
9098 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9099 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9100 uint32_t Overflow = 0;
9101
9102 if (Imm > MaxImm) {
9103 if (Imm <= MaxImm + 64) {
9104 // Use an SOffset inline constant for 4..64
9105 Overflow = Imm - MaxImm;
9106 Imm = MaxImm;
9107 } else {
9108 // Try to keep the same value in SOffset for adjacent loads, so that
9109 // the corresponding register contents can be re-used.
9110 //
9111 // Load values with all low-bits (except for alignment bits) set into
9112 // SOffset, so that a larger range of values can be covered using
9113 // s_movk_i32.
9114 //
9115 // Atomic operations fail to work correctly when individual address
9116 // components are unaligned, even if their sum is aligned.
9117 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9118 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9119 Imm = Low;
9120 Overflow = High - Alignment.value();
9121 }
9122 }
9123
9124 if (Overflow > 0) {
9125 // There is a hardware bug in SI and CI which prevents address clamping in
9126 // MUBUF instructions from working correctly with SOffsets. The immediate
9127 // offset is unaffected.
9128 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
9129 return false;
9130
9131 // It is not possible to set immediate in SOffset field on some targets.
9132 if (ST.hasRestrictedSOffset())
9133 return false;
9134 }
9135
9136 ImmOffset = Imm;
9137 SOffset = Overflow;
9138 return true;
9139}
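// --- Illustrative sketch (editor's addition, not part of SIInstrInfo.cpp) ---
// The split above as standalone arithmetic, ignoring the SI/CI and
// restricted-SOffset quirks: the immediate keeps at most MaxOffset (aligned
// down), small overflows go to SOffset directly, and large offsets put the
// aligned high bits into SOffset so adjacent accesses can share one
// s_movk_i32 value. Assumes MaxOffset is an all-ones mask (2^N - 1) and
// Align is a power of two, as for the MUBUF immediate field.
#include <cassert>
#include <cstdint>

static void splitMubufOffset(uint32_t Imm, uint32_t MaxOffset, uint32_t Align,
                             uint32_t &SOffset, uint32_t &ImmOffset) {
  const uint32_t Orig = Imm;
  const uint32_t MaxImm = MaxOffset & ~(Align - 1); // alignDown(MaxOffset, Align)
  uint32_t Overflow = 0;
  if (Imm > MaxImm) {
    if (Imm <= MaxImm + 64) {
      Overflow = Imm - MaxImm; // small overflow: SOffset inline constant 4..64
      Imm = MaxImm;
    } else {
      uint32_t High = (Imm + Align) & ~MaxOffset;
      uint32_t Low = (Imm + Align) & MaxOffset;
      Imm = Low;
      Overflow = High - Align; // aligned high part goes into SOffset
    }
  }
  ImmOffset = Imm;
  SOffset = Overflow;
  assert(SOffset + ImmOffset == Orig && ImmOffset <= MaxOffset);
}
// --- end sketch ---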
9140
9141// Depending on the used address space and instructions, some immediate offsets
9142// are allowed and some are not.
9143// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9144// scratch instruction offsets can also be negative. On GFX12, offsets can be
9145// negative for all variants.
9146//
9147// There are several bugs related to these offsets:
9148// On gfx10.1, flat instructions that go into the global address space cannot
9149// use an offset.
9150//
9151// For scratch instructions, the address can be either an SGPR or a VGPR.
9152// The following offsets can be used, depending on the architecture (x means
9153// cannot be used):
9154// +----------------------------+------+------+
9155// | Address-Mode | SGPR | VGPR |
9156// +----------------------------+------+------+
9157// | gfx9 | | |
9158// | negative, 4-aligned offset | x | ok |
9159// | negative, unaligned offset | x | ok |
9160// +----------------------------+------+------+
9161// | gfx10 | | |
9162// | negative, 4-aligned offset | ok | ok |
9163// | negative, unaligned offset | ok | x |
9164// +----------------------------+------+------+
9165// | gfx10.3 | | |
9166// | negative, 4-aligned offset | ok | ok |
9167// | negative, unaligned offset | ok | ok |
9168// +----------------------------+------+------+
9169//
9170// This function ignores the addressing mode, so if an offset cannot be used in
9171// one addressing mode, it is considered illegal.
9172bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9173 uint64_t FlatVariant) const {
9174 // TODO: Should 0 be special cased?
9175 if (!ST.hasFlatInstOffsets())
9176 return false;
9177
9178 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9179 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9180 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9181 return false;
9182
9183 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9184 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9185 (Offset % 4) != 0) {
9186 return false;
9187 }
9188
9189 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9190 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9191 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9192}
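// --- Illustrative sketch (editor's addition, not part of SIInstrInfo.cpp) ---
// The final range test above, with the bit count passed in explicitly: the
// offset must fit in an N-bit signed field, and negative values are only
// accepted when the flat variant allows them.
#include <cstdint>

static bool fitsFlatImmField(int64_t Offset, unsigned NumBits,
                             bool AllowNegative) {
  const int64_t Lo = -(int64_t{1} << (NumBits - 1));
  const int64_t Hi = (int64_t{1} << (NumBits - 1)) - 1;
  const bool IsIntN = Offset >= Lo && Offset <= Hi; // isIntN(NumBits, Offset)
  return IsIntN && (AllowNegative || Offset >= 0);
}
// e.g. with a 13-bit field: 4095 and -4096 fit, but -1 is rejected unless the
// variant allows negative offsets.
// --- end sketch ---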
9193
9194// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
9195std::pair<int64_t, int64_t>
9196SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9197 uint64_t FlatVariant) const {
9198 int64_t RemainderOffset = COffsetVal;
9199 int64_t ImmField = 0;
9200
9201 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9202 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9203
9204 if (AllowNegative) {
9205 // Use signed division by a power of two to truncate towards 0.
9206 int64_t D = 1LL << NumBits;
9207 RemainderOffset = (COffsetVal / D) * D;
9208 ImmField = COffsetVal - RemainderOffset;
9209
9210 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9211 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9212 (ImmField % 4) != 0) {
9213 // Make ImmField a multiple of 4
9214 RemainderOffset += ImmField % 4;
9215 ImmField -= ImmField % 4;
9216 }
9217 } else if (COffsetVal >= 0) {
9218 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9219 RemainderOffset = COffsetVal - ImmField;
9220 }
9221
9222 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9223 assert(RemainderOffset + ImmField == COffsetVal);
9224 return {ImmField, RemainderOffset};
9225}
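// --- Illustrative sketch (editor's addition, not part of SIInstrInfo.cpp) ---
// The signed split above, ignoring the unaligned-scratch workaround: dividing
// by D = 1 << NumBits with C++ signed division truncates towards zero, so the
// immediate part keeps the sign of the input and stays within +/-(D - 1),
// while the remainder is what still has to be materialized in a register.
#include <cassert>
#include <cstdint>

static void splitSignedOffset(int64_t Offset, unsigned NumBits,
                              int64_t &ImmField, int64_t &Remainder) {
  const int64_t D = int64_t{1} << NumBits;
  Remainder = (Offset / D) * D; // truncates towards zero
  ImmField = Offset - Remainder;
  assert(Remainder + ImmField == Offset);
  assert(ImmField > -D && ImmField < D);
}
// e.g. NumBits = 12: Offset = -5000 splits into ImmField = -904 and
// Remainder = -4096.
// --- end sketch ---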
9226
9227 bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
9228 if (ST.hasNegativeScratchOffsetBug() &&
9229 FlatVariant == SIInstrFlags::FlatScratch)
9230 return false;
9231
9232 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9233}
9234
9235static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9236 switch (ST.getGeneration()) {
9237 default:
9238 break;
9239 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
9240 case AMDGPUSubtarget::SEA_ISLANDS:
9241 return SIEncodingFamily::SI;
9242 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
9243 case AMDGPUSubtarget::GFX9:
9244 return SIEncodingFamily::VI;
9245 case AMDGPUSubtarget::GFX10:
9246 return SIEncodingFamily::GFX10;
9247 case AMDGPUSubtarget::GFX11:
9248 return SIEncodingFamily::GFX11;
9249 case AMDGPUSubtarget::GFX12:
9250 return SIEncodingFamily::GFX12;
9251 }
9252 llvm_unreachable("Unknown subtarget generation!");
9253}
9254
9255bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9256 switch(MCOp) {
9257 // These opcodes use indirect register addressing so
9258 // they need special handling by codegen (currently missing).
9259 // Therefore it is too risky to allow these opcodes
9260 // to be selected by dpp combiner or sdwa peepholer.
9261 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9262 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9263 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9264 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9265 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9266 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9267 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9268 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9269 return true;
9270 default:
9271 return false;
9272 }
9273}
9274
9275#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
9276 case OPCODE##_dpp: \
9277 case OPCODE##_e32: \
9278 case OPCODE##_e64: \
9279 case OPCODE##_e64_dpp: \
9280 case OPCODE##_sdwa:
9281
9282static bool isRenamedInGFX9(int Opcode) {
9283 switch (Opcode) {
9284 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
9285 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
9286 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
9287 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
9288 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
9289 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
9290 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
9291 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
9292 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
9293 //
9294 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
9295 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
9296 case AMDGPU::V_FMA_F16_gfx9_e64:
9297 case AMDGPU::V_INTERP_P2_F16:
9298 case AMDGPU::V_MAD_F16_e64:
9299 case AMDGPU::V_MAD_U16_e64:
9300 case AMDGPU::V_MAD_I16_e64:
9301 return true;
9302 default:
9303 return false;
9304 }
9305}
9306
9307int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9308 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9309
9310 unsigned Gen = subtargetEncodingFamily(ST);
9311
9312 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
9313 Gen = SIEncodingFamily::GFX9;
9314
9315 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9316 // subtarget has UnpackedD16VMem feature.
9317 // TODO: remove this when we discard GFX80 encoding.
9318 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9319 Gen = SIEncodingFamily::GFX80;
9320
9321 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9322 switch (ST.getGeneration()) {
9323 default:
9324 Gen = SIEncodingFamily::SDWA;
9325 break;
9326 case AMDGPUSubtarget::GFX9:
9327 Gen = SIEncodingFamily::SDWA9;
9328 break;
9329 case AMDGPUSubtarget::GFX10:
9330 Gen = SIEncodingFamily::SDWA10;
9331 break;
9332 }
9333 }
9334
9335 if (isMAI(Opcode)) {
9336 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9337 if (MFMAOp != -1)
9338 Opcode = MFMAOp;
9339 }
9340
9341 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9342
9343 // -1 means that Opcode is already a native instruction.
9344 if (MCOp == -1)
9345 return Opcode;
9346
9347 if (ST.hasGFX90AInsts()) {
9348 uint16_t NMCOp = (uint16_t)-1;
9349 if (ST.hasGFX940Insts())
9350 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
9351 if (NMCOp == (uint16_t)-1)
9352 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
9353 if (NMCOp == (uint16_t)-1)
9354 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
9355 if (NMCOp != (uint16_t)-1)
9356 MCOp = NMCOp;
9357 }
9358
9359 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9360 // no encoding in the given subtarget generation.
9361 if (MCOp == (uint16_t)-1)
9362 return -1;
9363
9364 if (isAsmOnlyOpcode(MCOp))
9365 return -1;
9366
9367 return MCOp;
9368}
9369
9370 static
9371 TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
9372 assert(RegOpnd.isReg());
9373 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9374 getRegSubRegPair(RegOpnd);
9375}
9376
9377 TargetInstrInfo::RegSubRegPair llvm::getRegSequenceSubReg(MachineInstr &MI,
9378 unsigned SubReg) {
9379 assert(MI.isRegSequence());
9380 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9381 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9382 auto &RegOp = MI.getOperand(1 + 2 * I);
9383 return getRegOrUndef(RegOp);
9384 }
9385 return TargetInstrInfo::RegSubRegPair();
9386}
9387
9388// Try to find the definition of reg:subreg in subreg-manipulation pseudos
9389// Following a subreg of reg:subreg isn't supported
9390 static bool followSubRegDef(MachineInstr &MI,
9391 TargetInstrInfo::RegSubRegPair &RSR) {
9392 if (!RSR.SubReg)
9393 return false;
9394 switch (MI.getOpcode()) {
9395 default: break;
9396 case AMDGPU::REG_SEQUENCE:
9397 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
9398 return true;
9399 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
9400 case AMDGPU::INSERT_SUBREG:
9401 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
9402 // inserted the subreg we're looking for
9403 RSR = getRegOrUndef(MI.getOperand(2));
9404 else { // the subreg in the rest of the reg
9405 auto R1 = getRegOrUndef(MI.getOperand(1));
9406 if (R1.SubReg) // subreg of subreg isn't supported
9407 return false;
9408 RSR.Reg = R1.Reg;
9409 }
9410 return true;
9411 }
9412 return false;
9413}
9414
9415 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
9416 MachineRegisterInfo &MRI) {
9417 assert(MRI.isSSA());
9418 if (!P.Reg.isVirtual())
9419 return nullptr;
9420
9421 auto RSR = P;
9422 auto *DefInst = MRI.getVRegDef(RSR.Reg);
9423 while (auto *MI = DefInst) {
9424 DefInst = nullptr;
9425 switch (MI->getOpcode()) {
9426 case AMDGPU::COPY:
9427 case AMDGPU::V_MOV_B32_e32: {
9428 auto &Op1 = MI->getOperand(1);
9429 if (Op1.isReg() && Op1.getReg().isVirtual()) {
9430 if (Op1.isUndef())
9431 return nullptr;
9432 RSR = getRegSubRegPair(Op1);
9433 DefInst = MRI.getVRegDef(RSR.Reg);
9434 }
9435 break;
9436 }
9437 default:
9438 if (followSubRegDef(*MI, RSR)) {
9439 if (!RSR.Reg)
9440 return nullptr;
9441 DefInst = MRI.getVRegDef(RSR.Reg);
9442 }
9443 }
9444 if (!DefInst)
9445 return MI;
9446 }
9447 return nullptr;
9448}
9449
9450 bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
9451 Register VReg,
9452 const MachineInstr &DefMI,
9453 const MachineInstr &UseMI) {
9454 assert(MRI.isSSA() && "Must be run on SSA");
9455
9456 auto *TRI = MRI.getTargetRegisterInfo();
9457 auto *DefBB = DefMI.getParent();
9458
9459 // Don't bother searching between blocks, although it is possible this block
9460 // doesn't modify exec.
9461 if (UseMI.getParent() != DefBB)
9462 return true;
9463
9464 const int MaxInstScan = 20;
9465 int NumInst = 0;
9466
9467 // Stop scan at the use.
9468 auto E = UseMI.getIterator();
9469 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
9470 if (I->isDebugInstr())
9471 continue;
9472
9473 if (++NumInst > MaxInstScan)
9474 return true;
9475
9476 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9477 return true;
9478 }
9479
9480 return false;
9481}
9482
9483 bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
9484 Register VReg,
9485 const MachineInstr &DefMI) {
9486 assert(MRI.isSSA() && "Must be run on SSA");
9487
9488 auto *TRI = MRI.getTargetRegisterInfo();
9489 auto *DefBB = DefMI.getParent();
9490
9491 const int MaxUseScan = 10;
9492 int NumUse = 0;
9493
9494 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
9495 auto &UseInst = *Use.getParent();
9496 // Don't bother searching between blocks, although it is possible this block
9497 // doesn't modify exec.
9498 if (UseInst.getParent() != DefBB || UseInst.isPHI())
9499 return true;
9500
9501 if (++NumUse > MaxUseScan)
9502 return true;
9503 }
9504
9505 if (NumUse == 0)
9506 return false;
9507
9508 const int MaxInstScan = 20;
9509 int NumInst = 0;
9510
9511 // Stop scan when we have seen all the uses.
9512 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
9513 assert(I != DefBB->end());
9514
9515 if (I->isDebugInstr())
9516 continue;
9517
9518 if (++NumInst > MaxInstScan)
9519 return true;
9520
9521 for (const MachineOperand &Op : I->operands()) {
9522 // We don't check reg masks here as they're used only on calls:
9523 // 1. EXEC is only considered const within one BB
9524 // 2. Call should be a terminator instruction if present in a BB
9525
9526 if (!Op.isReg())
9527 continue;
9528
9529 Register Reg = Op.getReg();
9530 if (Op.isUse()) {
9531 if (Reg == VReg && --NumUse == 0)
9532 return false;
9533 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
9534 return true;
9535 }
9536 }
9537}
9538
9539 MachineInstr *SIInstrInfo::createPHIDestinationCopy(
9540 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
9541 const DebugLoc &DL, Register Src, Register Dst) const {
9542 auto Cur = MBB.begin();
9543 if (Cur != MBB.end())
9544 do {
9545 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
9546 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
9547 ++Cur;
9548 } while (Cur != MBB.end() && Cur != LastPHIIt);
9549
9550 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
9551 Dst);
9552}
9553
9554 MachineInstr *SIInstrInfo::createPHISourceCopy(
9555 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
9556 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
9557 if (InsPt != MBB.end() &&
9558 (InsPt->getOpcode() == AMDGPU::SI_IF ||
9559 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9560 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9561 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
9562 InsPt++;
9563 return BuildMI(MBB, InsPt, DL,
9564 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
9565 : AMDGPU::S_MOV_B64_term),
9566 Dst)
9567 .addReg(Src, 0, SrcSubReg)
9568 .addReg(AMDGPU::EXEC, RegState::Implicit);
9569 }
9570 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
9571 Dst);
9572}
9573
9574bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
9575
9576 MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
9577 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
9578 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
9579 VirtRegMap *VRM) const {
9580 // This is a bit of a hack (copied from AArch64). Consider this instruction:
9581 //
9582 // %0:sreg_32 = COPY $m0
9583 //
9584 // We explicitly chose SReg_32 for the virtual register so such a copy might
9585 // be eliminated by RegisterCoalescer. However, that may not be possible, and
9586 // %0 may even spill. We can't spill $m0 normally (it would require copying to
9587 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
9588 // TargetInstrInfo::foldMemoryOperand() is going to try.
9589 // A similar issue also exists with spilling and reloading $exec registers.
9590 //
9591 // To prevent that, constrain the %0 register class here.
9592 if (isFullCopyInstr(MI)) {
9593 Register DstReg = MI.getOperand(0).getReg();
9594 Register SrcReg = MI.getOperand(1).getReg();
9595 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
9596 (DstReg.isVirtual() != SrcReg.isVirtual())) {
9597 MachineRegisterInfo &MRI = MF.getRegInfo();
9598 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
9599 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
9600 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
9601 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
9602 return nullptr;
9603 }
9604 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
9605 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
9606 return nullptr;
9607 }
9608 }
9609 }
9610
9611 return nullptr;
9612}
9613
9614 unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
9615 const MachineInstr &MI,
9616 unsigned *PredCost) const {
9617 if (MI.isBundle()) {
9618 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
9619 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
9620 unsigned Lat = 0, Count = 0;
9621 for (++I; I != E && I->isBundledWithPred(); ++I) {
9622 ++Count;
9623 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
9624 }
9625 return Lat + Count - 1;
9626 }
9627
9628 return SchedModel.computeInstrLatency(&MI);
9629}
9630
9631 InstructionUniformity
9632 SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
9633 unsigned opcode = MI.getOpcode();
9634 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
9635 auto IID = GI->getIntrinsicID();
9636 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
9637 return InstructionUniformity::NeverUniform;
9638 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
9639 return InstructionUniformity::AlwaysUniform;
9640
9641 switch (IID) {
9642 case Intrinsic::amdgcn_if:
9643 case Intrinsic::amdgcn_else:
9644 // FIXME: Uniform if second result
9645 break;
9646 }
9647
9648 return InstructionUniformity::Default;
9649 }
9650
9651 // Loads from the private and flat address spaces are divergent, because
9652 // threads can execute the load instruction with the same inputs and get
9653 // different results.
9654 //
9655 // All other loads are not divergent, because if threads issue loads with the
9656 // same arguments, they will always get the same result.
9657 if (opcode == AMDGPU::G_LOAD) {
9658 if (MI.memoperands_empty())
9659 return InstructionUniformity::NeverUniform; // conservative assumption
9660
9661 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9662 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9663 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9664 })) {
9665 // At least one MMO in a non-global address space.
9666 return InstructionUniformity::NeverUniform;
9667 }
9668 return InstructionUniformity::Default;
9669 }
9670
9671 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
9672 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
9673 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
9674 AMDGPU::isGenericAtomic(opcode)) {
9675 return InstructionUniformity::NeverUniform;
9676 }
9677 return InstructionUniformity::Default;
9678}
9679
9680 InstructionUniformity
9681 SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
9682
9683 if (isNeverUniform(MI))
9684 return InstructionUniformity::NeverUniform;
9685
9686 unsigned opcode = MI.getOpcode();
9687 if (opcode == AMDGPU::V_READLANE_B32 ||
9688 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
9689 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
9690 return InstructionUniformity::AlwaysUniform;
9691
9692 if (isCopyInstr(MI)) {
9693 const MachineOperand &srcOp = MI.getOperand(1);
9694 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
9695 const TargetRegisterClass *regClass =
9696 RI.getPhysRegBaseClass(srcOp.getReg());
9697 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
9698 : InstructionUniformity::NeverUniform;
9699 }
9700 return InstructionUniformity::Default;
9701 }
9702
9703 // GMIR handling
9704 if (MI.isPreISelOpcode())
9705 return getGenericInstructionUniformity(MI);
9706
9707 // Atomics are divergent because they are executed sequentially: when an
9708 // atomic operation refers to the same address in each thread, then each
9709 // thread after the first sees the value written by the previous thread as
9710 // original value.
9711
9712 if (isAtomic(MI))
9713 return InstructionUniformity::NeverUniform;
9714
9715 // Loads from the private and flat address spaces are divergent, because
9716 // threads can execute the load instruction with the same inputs and get
9717 // different results.
9718 if (isFLAT(MI) && MI.mayLoad()) {
9719 if (MI.memoperands_empty())
9720 return InstructionUniformity::NeverUniform; // conservative assumption
9721
9722 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9723 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9724 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9725 })) {
9726 // At least one MMO in a non-global address space.
9727 return InstructionUniformity::NeverUniform;
9728 }
9729
9730 return InstructionUniformity::Default;
9731 }
9732
9733 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9734 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
9735
9736 // FIXME: It's conceptually broken to report this for an instruction, and not
9737 // a specific def operand. For inline asm in particular, there could be mixed
9738 // uniform and divergent results.
9739 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
9740 const MachineOperand &SrcOp = MI.getOperand(I);
9741 if (!SrcOp.isReg())
9742 continue;
9743
9744 Register Reg = SrcOp.getReg();
9745 if (!Reg || !SrcOp.readsReg())
9746 continue;
9747
9748 // If RegBank is null, this is unassigned or an unallocatable special
9749 // register, which are all scalars.
9750 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
9751 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
9752 return InstructionUniformity::NeverUniform;
9753 }
9754
9755 // TODO: Uniformity check conditions above can be rearranged for more
9756 // readability.
9757
9758 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
9759 // currently turned into no-op COPYs by SelectionDAG ISel and are
9760 // therefore no longer recognizable.
9761
9762 return InstructionUniformity::Default;
9763}
9764
9765 unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
9766 switch (MF.getFunction().getCallingConv()) {
9767 case CallingConv::AMDGPU_PS:
9768 return 1;
9769 case CallingConv::AMDGPU_VS:
9770 return 2;
9771 case CallingConv::AMDGPU_GS:
9772 return 3;
9773 case CallingConv::AMDGPU_HS:
9774 case CallingConv::AMDGPU_LS:
9775 case CallingConv::AMDGPU_ES:
9776 report_fatal_error("ds_ordered_count unsupported for this calling conv");
9777 case CallingConv::AMDGPU_CS:
9778 case CallingConv::AMDGPU_KERNEL:
9779 case CallingConv::C:
9780 case CallingConv::Fast:
9781 default:
9782 // Assume other calling conventions are various compute callable functions
9783 return 0;
9784 }
9785}
9786
9787 bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
9788 Register &SrcReg2, int64_t &CmpMask,
9789 int64_t &CmpValue) const {
9790 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
9791 return false;
9792
9793 switch (MI.getOpcode()) {
9794 default:
9795 break;
9796 case AMDGPU::S_CMP_EQ_U32:
9797 case AMDGPU::S_CMP_EQ_I32:
9798 case AMDGPU::S_CMP_LG_U32:
9799 case AMDGPU::S_CMP_LG_I32:
9800 case AMDGPU::S_CMP_LT_U32:
9801 case AMDGPU::S_CMP_LT_I32:
9802 case AMDGPU::S_CMP_GT_U32:
9803 case AMDGPU::S_CMP_GT_I32:
9804 case AMDGPU::S_CMP_LE_U32:
9805 case AMDGPU::S_CMP_LE_I32:
9806 case AMDGPU::S_CMP_GE_U32:
9807 case AMDGPU::S_CMP_GE_I32:
9808 case AMDGPU::S_CMP_EQ_U64:
9809 case AMDGPU::S_CMP_LG_U64:
9810 SrcReg = MI.getOperand(0).getReg();
9811 if (MI.getOperand(1).isReg()) {
9812 if (MI.getOperand(1).getSubReg())
9813 return false;
9814 SrcReg2 = MI.getOperand(1).getReg();
9815 CmpValue = 0;
9816 } else if (MI.getOperand(1).isImm()) {
9817 SrcReg2 = Register();
9818 CmpValue = MI.getOperand(1).getImm();
9819 } else {
9820 return false;
9821 }
9822 CmpMask = ~0;
9823 return true;
9824 case AMDGPU::S_CMPK_EQ_U32:
9825 case AMDGPU::S_CMPK_EQ_I32:
9826 case AMDGPU::S_CMPK_LG_U32:
9827 case AMDGPU::S_CMPK_LG_I32:
9828 case AMDGPU::S_CMPK_LT_U32:
9829 case AMDGPU::S_CMPK_LT_I32:
9830 case AMDGPU::S_CMPK_GT_U32:
9831 case AMDGPU::S_CMPK_GT_I32:
9832 case AMDGPU::S_CMPK_LE_U32:
9833 case AMDGPU::S_CMPK_LE_I32:
9834 case AMDGPU::S_CMPK_GE_U32:
9835 case AMDGPU::S_CMPK_GE_I32:
9836 SrcReg = MI.getOperand(0).getReg();
9837 SrcReg2 = Register();
9838 CmpValue = MI.getOperand(1).getImm();
9839 CmpMask = ~0;
9840 return true;
9841 }
9842
9843 return false;
9844}
9845
9846 bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
9847 Register SrcReg2, int64_t CmpMask,
9848 int64_t CmpValue,
9849 const MachineRegisterInfo *MRI) const {
9850 if (!SrcReg || SrcReg.isPhysical())
9851 return false;
9852
9853 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
9854 return false;
9855
9856 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
9857 this](int64_t ExpectedValue, unsigned SrcSize,
9858 bool IsReversible, bool IsSigned) -> bool {
9859 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9860 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9861 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9862 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9863 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
9864 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9865 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9866 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9867 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9868 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
9869 //
9870 // Signed ge/gt are not used for the sign bit.
9871 //
9872 // If result of the AND is unused except in the compare:
9873 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
9874 //
9875 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9876 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9877 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
9878 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9879 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9880 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
9881
9882 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
9883 if (!Def || Def->getParent() != CmpInstr.getParent())
9884 return false;
9885
9886 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
9887 Def->getOpcode() != AMDGPU::S_AND_B64)
9888 return false;
9889
9890 int64_t Mask;
9891 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
9892 if (MO->isImm())
9893 Mask = MO->getImm();
9894 else if (!getFoldableImm(MO, Mask))
9895 return false;
9896 Mask &= maxUIntN(SrcSize);
9897 return isPowerOf2_64(Mask);
9898 };
9899
9900 MachineOperand *SrcOp = &Def->getOperand(1);
9901 if (isMask(SrcOp))
9902 SrcOp = &Def->getOperand(2);
9903 else if (isMask(&Def->getOperand(2)))
9904 SrcOp = &Def->getOperand(1);
9905 else
9906 return false;
9907
9908 // A valid Mask is required to have a single bit set, hence a non-zero and
9909 // power-of-two value. This verifies that we will not do 64-bit shift below.
9910 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
9911 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
9912 if (IsSigned && BitNo == SrcSize - 1)
9913 return false;
9914
9915 ExpectedValue <<= BitNo;
9916
9917 bool IsReversedCC = false;
9918 if (CmpValue != ExpectedValue) {
9919 if (!IsReversible)
9920 return false;
9921 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
9922 if (!IsReversedCC)
9923 return false;
9924 }
9925
9926 Register DefReg = Def->getOperand(0).getReg();
9927 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
9928 return false;
9929
9930 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
9931 I != E; ++I) {
9932 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
9933 I->killsRegister(AMDGPU::SCC, &RI))
9934 return false;
9935 }
9936
9937 MachineOperand *SccDef =
9938 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
9939 SccDef->setIsDead(false);
9940 CmpInstr.eraseFromParent();
9941
9942 if (!MRI->use_nodbg_empty(DefReg)) {
9943 assert(!IsReversedCC);
9944 return true;
9945 }
9946
9947 // Replace AND with unused result with a S_BITCMP.
9948 MachineBasicBlock *MBB = Def->getParent();
9949
9950 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
9951 : AMDGPU::S_BITCMP1_B32
9952 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
9953 : AMDGPU::S_BITCMP1_B64;
9954
9955 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
9956 .add(*SrcOp)
9957 .addImm(BitNo);
9958 Def->eraseFromParent();
9959
9960 return true;
9961 };
9962
9963 switch (CmpInstr.getOpcode()) {
9964 default:
9965 break;
9966 case AMDGPU::S_CMP_EQ_U32:
9967 case AMDGPU::S_CMP_EQ_I32:
9968 case AMDGPU::S_CMPK_EQ_U32:
9969 case AMDGPU::S_CMPK_EQ_I32:
9970 return optimizeCmpAnd(1, 32, true, false);
9971 case AMDGPU::S_CMP_GE_U32:
9972 case AMDGPU::S_CMPK_GE_U32:
9973 return optimizeCmpAnd(1, 32, false, false);
9974 case AMDGPU::S_CMP_GE_I32:
9975 case AMDGPU::S_CMPK_GE_I32:
9976 return optimizeCmpAnd(1, 32, false, true);
9977 case AMDGPU::S_CMP_EQ_U64:
9978 return optimizeCmpAnd(1, 64, true, false);
9979 case AMDGPU::S_CMP_LG_U32:
9980 case AMDGPU::S_CMP_LG_I32:
9981 case AMDGPU::S_CMPK_LG_U32:
9982 case AMDGPU::S_CMPK_LG_I32:
9983 return optimizeCmpAnd(0, 32, true, false);
9984 case AMDGPU::S_CMP_GT_U32:
9985 case AMDGPU::S_CMPK_GT_U32:
9986 return optimizeCmpAnd(0, 32, false, false);
9987 case AMDGPU::S_CMP_GT_I32:
9988 case AMDGPU::S_CMPK_GT_I32:
9989 return optimizeCmpAnd(0, 32, false, true);
9990 case AMDGPU::S_CMP_LG_U64:
9991 return optimizeCmpAnd(0, 64, true, false);
9992 }
9993
9994 return false;
9995}
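// --- Illustrative sketch (editor's addition, not part of SIInstrInfo.cpp) ---
// The arithmetic behind optimizeCmpAnd: for a single-bit mask (1 << n),
// (x & mask) can only be 0 or mask, so comparing it against ExpectedValue
// shifted up by n is the same as testing bit n, and comparing against the
// other of the two possible values (ExpectedValue ^ mask) is the reversed
// test. bitNo/testBit below are hypothetical helpers, not LLVM APIs.
#include <cassert>
#include <cstdint>

static unsigned bitNo(uint64_t Mask) { // countr_zero of a power-of-two mask
  unsigned N = 0;
  while (!(Mask & 1)) { Mask >>= 1; ++N; }
  return N;
}

static bool testBit(uint64_t X, uint64_t Mask, bool ExpectSet) {
  // s_cmp_eq (s_and x, mask), mask  <=>  bit is set   (ExpectSet = true)
  // s_cmp_eq (s_and x, mask), 0     <=>  bit is clear (ExpectSet = false)
  return ((X >> bitNo(Mask)) & 1) == (ExpectSet ? 1u : 0u);
}

static void checkBitTestEquivalence() {
  const uint64_t Mask = uint64_t{1} << 5;
  const uint64_t Vals[] = {0, 32, 33, ~uint64_t{0}};
  for (uint64_t X : Vals) {
    assert(((X & Mask) == Mask) == testBit(X, Mask, true));
    assert(((X & Mask) == 0) == testBit(X, Mask, false));
  }
}
// --- end sketch ---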
9996
9997 void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
9998 unsigned OpName) const {
9999 if (!ST.needsAlignedVGPRs())
10000 return;
10001
10002 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10003 if (OpNo < 0)
10004 return;
10005 MachineOperand &Op = MI.getOperand(OpNo);
10006 if (getOpSize(MI, OpNo) > 4)
10007 return;
10008
10009 // Add implicit aligned super-reg to force alignment on the data operand.
10010 const DebugLoc &DL = MI.getDebugLoc();
10011 MachineBasicBlock *BB = MI.getParent();
10012 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
10013 Register DataReg = Op.getReg();
10014 bool IsAGPR = RI.isAGPR(MRI, DataReg);
10015 Register Undef = MRI.createVirtualRegister(
10016 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10017 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10018 Register NewVR =
10019 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10020 : &AMDGPU::VReg_64_Align2RegClass);
10021 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10022 .addReg(DataReg, 0, Op.getSubReg())
10023 .addImm(AMDGPU::sub0)
10024 .addReg(Undef)
10025 .addImm(AMDGPU::sub1);
10026 Op.setReg(NewVR);
10027 Op.setSubReg(AMDGPU::sub0);
10028 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
10029}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
TargetInstrInfo::RegSubRegPair RegSubRegPair
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillRestoreOpcode(unsigned Size)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MachineRegisterInfo &MRI, const MCInstrDesc &TID, unsigned RCID, bool IsAllocatable)
static unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillSaveOpcode(unsigned Size)
static bool resultDependsOnExec(const MachineInstr &MI)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
Definition: SIInstrInfo.cpp:82
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static constexpr unsigned ModifierOpNames[]
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
Definition: SIInstrInfo.cpp:73
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool IsDead
static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool has16BitInsts() const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
bool hasInv2PiInlineImm() const
Class for arbitrary precision integers.
Definition: APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:171
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:277
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
bool useVGPRIndexMode() const
bool hasSDWAOmod() const
Definition: GCNSubtarget.h:759
bool hasA16() const
bool hasSDWAScalar() const
Definition: GCNSubtarget.h:763
bool hasScalarCompareEq64() const
bool hasOnlyRevVALUShifts() const
Definition: GCNSubtarget.h:401
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:641
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:779
bool hasMAIInsts() const
Definition: GCNSubtarget.h:837
bool hasMFMAInlineLiteralBug() const
bool hasNegativeScratchOffsetBug() const
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasVALUMaskWriteHazard() const
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasR128A16() const
bool hasOffset3fBug() const
bool hasGetPCZeroExtension() const
const AMDGPURegisterBankInfo * getRegBankInfo() const override
Definition: GCNSubtarget.h:313
bool hasSDWAOutModsVOPC() const
Definition: GCNSubtarget.h:775
bool hasRestrictedSOffset() const
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:694
bool hasGFX940Insts() const
bool hasSDWASdst() const
Definition: GCNSubtarget.h:767
bool hasVALUReadSGPRHazard() const
bool hasMovB64() const
bool isWave32() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:354
bool hasNegativeUnalignedScratchOffsetBug() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasNoF16PseudoScalarTransInlineConstants() const
Generation getGeneration() const
Definition: GCNSubtarget.h:327
bool hasVOP3Literal() const
Definition: GCNSubtarget.h:946
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:746
bool hasAddr64() const
Definition: GCNSubtarget.h:391
bool isWave64() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:738
bool hasGDS() const
bool hasPartialNSAEncoding() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
A possibly irreducible generalization of a Loop.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have a successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
LiveInterval - This class represents the liveness of a register, or stack slot.
Definition: LiveInterval.h:687
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:157
iterator find(SlotIndex Pos)
find - Return an iterator pointing to the first segment that ends after Pos, or end().
void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:542
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:612
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:622
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:222
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getNumOperands() const
Return the number of declared MachineOperands for this machine instruction.
Definition: MCInstrDesc.h:237
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition: MCInstrDesc.h:444
bool mayLoad() const
Return true if this instruction could possibly read memory.
Definition: MCInstrDesc.h:438
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition: MCInstrDesc.h:248
unsigned short Opcode
Definition: MCInstrDesc.h:205
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
Definition: MCInstrDesc.h:565
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
uint8_t OperandType
Information about the type of the operand.
Definition: MCInstrDesc.h:97
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:91
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:398
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
void setVariableValue(const MCExpr *Value)
Definition: MCSymbol.cpp:47
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
unsigned pred_size() const
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< iterator > terminators()
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
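A minimal sketch (not taken from this file) of the chained MachineInstrBuilder interface listed above; the helper, its name, and the opcode choice are illustrative assumptions, and the usual AMDGPU backend headers (e.g. SIInstrInfo.h) are assumed to be in scope.

#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// Hypothetical helper: materialize an immediate into DstReg with s_mov_b32.
static void emitSMovImm(const SIInstrInfo &TII, MachineBasicBlock &MBB,
                        MachineBasicBlock::iterator I, const DebugLoc &DL,
                        Register DstReg, int64_t Imm) {
  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), DstReg) // start the builder
      .addImm(Imm);                                       // append the immediate operand
}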
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:575
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:347
void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
unsigned getNumOperands() const
Returns the total number of operands.
Definition: MachineInstr.h:578
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
iterator_range< mop_iterator > explicit_operands()
Definition: MachineInstr.h:697
unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:821
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:806
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:788
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:499
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
iterator_range< mop_iterator > implicit_operands()
Definition: MachineInstr.h:705
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
uint32_t getFlags() const
Return the MI flags bitvector.
Definition: MachineInstr.h:392
int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return whether a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:798
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:560
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
static bool isVOP3(const MachineInstr &MI)
Definition: SIInstrInfo.h:510
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
Definition: SIInstrInfo.h:930
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo for the given opcode.
Definition: SIInstrInfo.h:1164
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:642
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:550
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const
Get required immediate operand.
Definition: SIInstrInfo.h:1294
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:542
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:655
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:414
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:502
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:518
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:610
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:526
void removeModOperands(MachineInstr &MI) const
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:594
static bool isSOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:454
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:576
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:634
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:602
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
Definition: SIInstrInfo.h:430
static bool isSOPK(const MachineInstr &MI)
Definition: SIInstrInfo.h:470
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:971
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:534
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:624
static bool isF16PseudoScalarTrans(unsigned Opcode)
Definition: SIInstrInfo.h:955
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:766
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
void materializeImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, int64_t Value) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by the assembler.
static bool isVGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:722
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
static bool isWWMRegSpillOpcode(uint16_t Opcode)
Definition: SIInstrInfo.h:754
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool isVGPRCopy(const MachineInstr &MI) const
Definition: SIInstrInfo.h:1017
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:586
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description.
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
static bool isAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:687
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
static bool sopkIsZext(unsigned Opcode)
Definition: SIInstrInfo.h:869
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
static bool isSGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:734
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:815
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:462
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:618
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:422
bool isBarrier(unsigned Opcode) const
Definition: SIInstrInfo.h:945
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
Definition: SIInstrInfo.h:1307
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
static bool isFixedSize(const MachineInstr &MI)
Definition: SIInstrInfo.h:886
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the OpIdx operand of MI.
static bool isLDSDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:568
unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isVOP1(const MachineInstr &MI)
Definition: SIInstrInfo.h:494
SIInstrInfo(const GCNSubtarget &ST)
Definition: SIInstrInfo.cpp:63
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
static bool hasVectorRegisters(const TargetRegisterClass *RC)
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool opCanUseInlineConstant(unsigned OpType) const
bool isVectorRegister(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
bool opCanUseLiteralConstant(unsigned OpType) const
static bool hasVGPRs(const TargetRegisterClass *RC)
static bool isVGPRClass(const TargetRegisterClass *RC)
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
const TargetRegisterClass * getBoolRC() const
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getChannelFromSubReg(unsigned SubReg) const
MCRegister getVCC() const
static bool hasAGPRs(const TargetRegisterClass *RC)
const TargetRegisterClass * getWaveMaskRegClass() const
bool spillSGPRToVGPR() const
const TargetRegisterClass * getVGPR64Class() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:65
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:237
SlotIndexes pass.
Definition: SlotIndexes.h:297
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:531
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:298
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
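A minimal sketch of the SmallVector usage implied above; the element type, inline capacity, and contents are illustrative assumptions.

#include "llvm/ADT/SmallVector.h"
// Collect values with inline storage for four elements before heap allocation.
static llvm::SmallVector<int, 4> collectSmall() {
  llvm::SmallVector<int, 4> Worklist;
  Worklist.push_back(1); // stays in inline storage until it outgrows four elements
  Worklist.push_back(2);
  return Worklist;
}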
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
const TargetRegisterClass * getAllocatableClass(const TargetRegisterClass *RC) const
Return the maximal subclass of the given register class that is allocatable, or NULL if there is none.
unsigned getSubRegIdxSize(unsigned Idx) const
Get the size of the bit range covered by a sub-register index.
unsigned getSubRegIdxOffset(unsigned Idx) const
Get the offset of the bit range covered by a sub-register index.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:95
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
Definition: SIInstrInfo.h:1582
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isDPALU_DPP(const MCInstrDesc &OpDesc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
Definition: SIInstrInfo.h:1583
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
bool isTrue16Inst(unsigned Opc)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo)
Is this an AMDGPU specific source operand? These include registers, inline constants,...
const uint64_t RSRC_TID_ENABLE
Definition: SIInstrInfo.h:1585
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isGenericAtomic(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition: SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition: SIDefines.h:201
@ OPERAND_REG_IMM_V2FP16
Definition: SIDefines.h:211
@ OPERAND_REG_INLINE_C_V2INT32
Definition: SIDefines.h:227
@ OPERAND_REG_INLINE_C_FP64
Definition: SIDefines.h:223
@ OPERAND_REG_INLINE_C_BF16
Definition: SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2BF16
Definition: SIDefines.h:225
@ OPERAND_REG_IMM_V2INT16
Definition: SIDefines.h:212
@ OPERAND_REG_IMM_BF16
Definition: SIDefines.h:205
@ OPERAND_REG_INLINE_AC_V2FP16
Definition: SIDefines.h:246
@ OPERAND_REG_IMM_INT32
Operands with register or 32-bit immediate.
Definition: SIDefines.h:200
@ OPERAND_REG_IMM_V2BF16
Definition: SIDefines.h:210
@ OPERAND_REG_IMM_BF16_DEFERRED
Definition: SIDefines.h:207
@ OPERAND_REG_IMM_FP16
Definition: SIDefines.h:206
@ OPERAND_REG_INLINE_C_INT64
Definition: SIDefines.h:219
@ OPERAND_REG_INLINE_AC_BF16
Definition: SIDefines.h:240
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition: SIDefines.h:217
@ OPERAND_REG_INLINE_AC_INT16
Operands with an AccVGPR register or inline constant.
Definition: SIDefines.h:238
@ OPERAND_REG_IMM_FP64
Definition: SIDefines.h:204
@ OPERAND_REG_INLINE_C_V2FP16
Definition: SIDefines.h:226
@ OPERAND_REG_INLINE_AC_V2INT16
Definition: SIDefines.h:244
@ OPERAND_REG_INLINE_AC_FP16
Definition: SIDefines.h:241
@ OPERAND_REG_INLINE_AC_INT32
Definition: SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition: SIDefines.h:242
@ OPERAND_REG_INLINE_AC_V2BF16
Definition: SIDefines.h:245
@ OPERAND_REG_IMM_V2INT32
Definition: SIDefines.h:213
@ OPERAND_REG_IMM_FP32
Definition: SIDefines.h:203
@ OPERAND_INPUT_MODS
Definition: SIDefines.h:251
@ OPERAND_REG_INLINE_C_FP32
Definition: SIDefines.h:222
@ OPERAND_REG_INLINE_C_INT32
Definition: SIDefines.h:218
@ OPERAND_REG_INLINE_C_V2INT16
Definition: SIDefines.h:224
@ OPERAND_REG_IMM_V2FP32
Definition: SIDefines.h:214
@ OPERAND_REG_INLINE_AC_FP64
Definition: SIDefines.h:243
@ OPERAND_REG_INLINE_C_FP16
Definition: SIDefines.h:221
@ OPERAND_REG_IMM_INT16
Definition: SIDefines.h:202
@ OPERAND_REG_INLINE_C_V2FP32
Definition: SIDefines.h:228
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition: SIDefines.h:231
@ OPERAND_REG_IMM_FP32_DEFERRED
Definition: SIDefines.h:209
@ OPERAND_REG_IMM_FP16_DEFERRED
Definition: SIDefines.h:208
@ TI_SCRATCH_RSRC_DWORD1
Definition: AMDGPU.h:466
@ TI_SCRATCH_RSRC_DWORD3
Definition: AMDGPU.h:468
@ TI_SCRATCH_RSRC_DWORD0
Definition: AMDGPU.h:465
@ TI_SCRATCH_RSRC_DWORD2
Definition: AMDGPU.h:467
@ TI_CONSTDATA_START
Definition: AMDGPU.h:464
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
int getMCOpcode(uint16_t Opcode, unsigned Gen)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
Definition: SIInstrInfo.h:1584
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
bool isGraphics(CallingConv::ID cc)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ OPERAND_GENERIC_4
Definition: MCInstrDesc.h:70
@ OPERAND_GENERIC_2
Definition: MCInstrDesc.h:68
@ OPERAND_GENERIC_1
Definition: MCInstrDesc.h:67
@ OPERAND_REGISTER
Definition: MCInstrDesc.h:61
@ OPERAND_GENERIC_3
Definition: MCInstrDesc.h:69
@ OPERAND_IMMEDIATE
Definition: MCInstrDesc.h:60
@ OPERAND_UNKNOWN
Definition: MCInstrDesc.h:59
@ OPERAND_GENERIC_0
Definition: MCInstrDesc.h:66
@ OPERAND_GENERIC_5
Definition: MCInstrDesc.h:71
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:138
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition: SIInstrInfo.h:1468
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
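A minimal sketch of the range-based all_of wrapper listed above; the container and predicate are illustrative assumptions.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
// True when every element of the range satisfies the predicate.
static bool allPositive(const llvm::SmallVectorImpl<int> &Vals) {
  return llvm::all_of(Vals, [](int V) { return V > 0; });
}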
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
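A minimal sketch of popcount; the sample value is an illustrative assumption.

#include "llvm/ADT/bit.h"
// 0b1011 has three set bits.
static int setBitsOfEleven() { return llvm::popcount(0b1011u); } // returns 3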
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
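A minimal sketch of make_early_inc_range for erase-while-iterating; the "delete all COPYs" policy is purely illustrative and not the behavior of any pass in this file.

#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
// Early increment keeps iteration valid while instructions are removed.
static void eraseCopies(llvm::MachineBasicBlock &MBB) {
  for (llvm::MachineInstr &MI : llvm::make_early_inc_range(MBB))
    if (MI.isCopy())
      MI.eraseFromParent();
}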
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
Definition: MathExtras.h:555
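A worked example of alignDown; the operands are illustrative assumptions.

#include "llvm/Support/MathExtras.h"
// Round 77 down to a multiple of 16.
static_assert(llvm::alignDown(77u, 16u) == 64u, "77 aligned down to 16 is 64");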
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:296
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
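A minimal sketch of countr_zero; the sample value is an illustrative assumption.

#include "llvm/ADT/bit.h"
// 0b1000 has three trailing zero bits.
static int trailingZerosOfEight() { return llvm::countr_zero(0b1000u); } // returns 3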
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:43
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
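A worked example of Hi_32/Lo_32 splitting a 64-bit value into halves (the kind of split used when expanding 64-bit constants); the sample value is an illustrative assumption.

#include "llvm/Support/MathExtras.h"
static_assert(llvm::Lo_32(0x1122334455667788ull) == 0x55667788u, "low half");
static_assert(llvm::Hi_32(0x1122334455667788ull) == 0x11223344u, "high half");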
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
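A worked example of divideCeil; the operands are illustrative assumptions.

#include "llvm/Support/MathExtras.h"
// Integer ceiling division: ceil(10 / 4) == 3.
static_assert(llvm::divideCeil(10u, 4u) == 3u, "ceil(10/4) is 3");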
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:260
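A minimal sketch of isIntN as a range check on a signed immediate field; the 16-bit width and the helper are illustrative assumptions.

#include "llvm/Support/MathExtras.h"
// True when Offset is representable in a signed 16-bit field.
static bool fitsInS16(int64_t Offset) { return llvm::isIntN(16, Offset); }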
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
Definition: TargetOpcodes.h:36
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition: SIInstrInfo.h:39
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
@ DS_Error
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:47
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition: Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition: MathExtras.h:219
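A minimal sketch of maxUIntN; the 12-bit field width is an illustrative assumption.

#include "llvm/Support/MathExtras.h"
#include <cstdint>
// Largest value representable in a 12-bit unsigned field.
static uint64_t maxOffset12() { return llvm::maxUIntN(12); } // 4095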
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition: APFloat.cpp:218
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Description of the encoding of one expression Op.
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
Definition: LiveVariables.h:83
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition: SIInstrInfo.h:51
MachineInstr * top() const
Definition: SIInstrInfo.h:56
bool empty() const
Definition: SIInstrInfo.h:66
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition: SIInstrInfo.h:75
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.