SIInstrInfo.cpp
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
31#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include "llvm/MC/MCContext.h"
35
36using namespace llvm;
37
38#define DEBUG_TYPE "si-instr-info"
39
40#define GET_INSTRINFO_CTOR_DTOR
41#include "AMDGPUGenInstrInfo.inc"
42
43namespace llvm::AMDGPU {
44#define GET_D16ImageDimIntrinsics_IMPL
45#define GET_ImageDimIntrinsicTable_IMPL
46#define GET_RsrcIntrinsics_IMPL
47#include "AMDGPUGenSearchableTables.inc"
48} // namespace llvm::AMDGPU
49
50// Must be at least 4 to be able to branch over minimum unconditional branch
51// code. This is only for making it possible to write reasonably small tests for
52// long branches.
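// (In practice a test can lower this on the llc command line, e.g.
// -amdgpu-s-branch-bits=5, to force branch relaxation without a huge function.)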
53static cl::opt<unsigned>
54BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
55 cl::desc("Restrict range of branch instructions (DEBUG)"));
56
57static cl::opt<bool> Fix16BitCopies(
58 "amdgpu-fix-16-bit-physreg-copies",
59 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
60 cl::init(true),
61 cl::ReallyHidden);
62
63SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
64 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
65 RI(ST), ST(ST) {
66 SchedModel.init(&ST);
67}
68
69//===----------------------------------------------------------------------===//
70// TargetInstrInfo callbacks
71//===----------------------------------------------------------------------===//
72
73static unsigned getNumOperandsNoGlue(SDNode *Node) {
74 unsigned N = Node->getNumOperands();
75 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
76 --N;
77 return N;
78}
79
80/// Returns true if both nodes have the same value for the given
81/// operand \p Op, or if both nodes do not have this operand.
82static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
83 unsigned Opc0 = N0->getMachineOpcode();
84 unsigned Opc1 = N1->getMachineOpcode();
85
86 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
87 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
88
89 if (Op0Idx == -1 && Op1Idx == -1)
90 return true;
91
92
93 if ((Op0Idx == -1 && Op1Idx != -1) ||
94 (Op1Idx == -1 && Op0Idx != -1))
95 return false;
96
97 // getNamedOperandIdx returns the index for the MachineInstr's operands,
98 // which includes the result as the first operand. We are indexing into the
99 // MachineSDNode's operands, so we need to skip the result operand to get
100 // the real index.
101 --Op0Idx;
102 --Op1Idx;
103
104 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
105}
106
107static bool canRemat(const MachineInstr &MI) {
108
109 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
110 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
111 SIInstrInfo::isSALU(MI))
112 return true;
113
114 if (SIInstrInfo::isSMRD(MI)) {
115 return !MI.memoperands_empty() &&
116 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
117 return MMO->isLoad() && MMO->isInvariant();
118 });
119 }
120
121 return false;
122}
123
124bool SIInstrInfo::isReallyTriviallyReMaterializable(
125 const MachineInstr &MI) const {
126
127 if (canRemat(MI)) {
128 // Normally VALU use of exec would block the rematerialization, but that
129 // is OK in this case to have an implicit exec read as all VALU do.
130 // We really want all of the generic logic for this except for this.
131
132 // Another potential implicit use is mode register. The core logic of
133 // the RA will not attempt rematerialization if mode is set anywhere
134 // in the function, otherwise it is safe since mode is not changed.
135
136 // There is difference to generic method which does not allow
137 // rematerialization if there are virtual register uses. We allow this,
138 // therefore this method includes SOP instructions as well.
139 if (!MI.hasImplicitDef() &&
140 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
141 !MI.mayRaiseFPException())
142 return true;
143 }
144
145 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
146}
147
148// Returns true if the scalar result of a VALU instruction depends on exec.
149static bool resultDependsOnExec(const MachineInstr &MI) {
150 // Ignore comparisons which are only used masked with exec.
151 // This allows some hoisting/sinking of VALU comparisons.
152 if (MI.isCompare()) {
153 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
154 Register DstReg = MI.getOperand(0).getReg();
155 if (!DstReg.isVirtual())
156 return true;
157 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
158 switch (Use.getOpcode()) {
159 case AMDGPU::S_AND_SAVEEXEC_B32:
160 case AMDGPU::S_AND_SAVEEXEC_B64:
161 break;
162 case AMDGPU::S_AND_B32:
163 case AMDGPU::S_AND_B64:
164 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
165 return true;
166 break;
167 default:
168 return true;
169 }
170 }
171 return false;
172 }
173
174 switch (MI.getOpcode()) {
175 default:
176 break;
177 case AMDGPU::V_READFIRSTLANE_B32:
178 return true;
179 }
180
181 return false;
182}
183
184bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
185 // Any implicit use of exec by VALU is not a real register read.
186 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
187 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
188}
189
190bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
191 MachineBasicBlock *SuccToSinkTo,
192 MachineCycleInfo *CI) const {
193 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
194 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
195 return true;
196
197 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
198 // Check if sinking of MI would create temporal divergent use.
199 for (auto Op : MI.uses()) {
200 if (Op.isReg() && Op.getReg().isVirtual() &&
201 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
202 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
203
204 // SgprDef defined inside cycle
205 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
206 if (FromCycle == nullptr)
207 continue;
208
209 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
210 // Check if there is a FromCycle that contains SgprDef's basic block but
211 // does not contain SuccToSinkTo and also has divergent exit condition.
212 while (FromCycle && !FromCycle->contains(ToCycle)) {
213 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
214 FromCycle->getExitingBlocks(ExitingBlocks);
215
216 // FromCycle has divergent exit condition.
217 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
218 if (hasDivergentBranch(ExitingBlock))
219 return false;
220 }
221
222 FromCycle = FromCycle->getParentCycle();
223 }
224 }
225 }
226
227 return true;
228}
229
230bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
231 int64_t &Offset0,
232 int64_t &Offset1) const {
233 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
234 return false;
235
236 unsigned Opc0 = Load0->getMachineOpcode();
237 unsigned Opc1 = Load1->getMachineOpcode();
238
239 // Make sure both are actually loads.
240 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
241 return false;
242
243 // A mayLoad instruction without a def is not a load. Likely a prefetch.
244 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
245 return false;
246
247 if (isDS(Opc0) && isDS(Opc1)) {
248
249 // FIXME: Handle this case:
250 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
251 return false;
252
253 // Check base reg.
254 if (Load0->getOperand(0) != Load1->getOperand(0))
255 return false;
256
257 // Skip read2 / write2 variants for simplicity.
258 // TODO: We should report true if the used offsets are adjacent (excluded
259 // st64 versions).
260 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
261 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
262 if (Offset0Idx == -1 || Offset1Idx == -1)
263 return false;
264
265 // XXX - be careful of dataless loads
266 // getNamedOperandIdx returns the index for MachineInstrs. Since they
267 // include the output in the operand list, but SDNodes don't, we need to
268 // subtract the index by one.
269 Offset0Idx -= get(Opc0).NumDefs;
270 Offset1Idx -= get(Opc1).NumDefs;
271 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
272 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
273 return true;
274 }
275
276 if (isSMRD(Opc0) && isSMRD(Opc1)) {
277 // Skip time and cache invalidation instructions.
278 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
279 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
280 return false;
281
282 unsigned NumOps = getNumOperandsNoGlue(Load0);
283 if (NumOps != getNumOperandsNoGlue(Load1))
284 return false;
285
286 // Check base reg.
287 if (Load0->getOperand(0) != Load1->getOperand(0))
288 return false;
289
290 // Match register offsets, if both register and immediate offsets present.
291 assert(NumOps == 4 || NumOps == 5);
292 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
293 return false;
294
295 const ConstantSDNode *Load0Offset =
296 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
297 const ConstantSDNode *Load1Offset =
298 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
299
300 if (!Load0Offset || !Load1Offset)
301 return false;
302
303 Offset0 = Load0Offset->getZExtValue();
304 Offset1 = Load1Offset->getZExtValue();
305 return true;
306 }
307
308 // MUBUF and MTBUF can access the same addresses.
309 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
310
311 // MUBUF and MTBUF have vaddr at different indices.
312 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
313 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
314 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
315 return false;
316
317 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
318 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
319
320 if (OffIdx0 == -1 || OffIdx1 == -1)
321 return false;
322
323 // getNamedOperandIdx returns the index for MachineInstrs. Since they
324 // include the output in the operand list, but SDNodes don't, we need to
325 // subtract the index by one.
326 OffIdx0 -= get(Opc0).NumDefs;
327 OffIdx1 -= get(Opc1).NumDefs;
328
329 SDValue Off0 = Load0->getOperand(OffIdx0);
330 SDValue Off1 = Load1->getOperand(OffIdx1);
331
332 // The offset might be a FrameIndexSDNode.
333 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
334 return false;
335
336 Offset0 = Off0->getAsZExtVal();
337 Offset1 = Off1->getAsZExtVal();
338 return true;
339 }
340
341 return false;
342}
343
344static bool isStride64(unsigned Opc) {
345 switch (Opc) {
346 case AMDGPU::DS_READ2ST64_B32:
347 case AMDGPU::DS_READ2ST64_B64:
348 case AMDGPU::DS_WRITE2ST64_B32:
349 case AMDGPU::DS_WRITE2ST64_B64:
350 return true;
351 default:
352 return false;
353 }
354}
355
356bool SIInstrInfo::getMemOperandsWithOffsetWidth(
357 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
358 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
359 const TargetRegisterInfo *TRI) const {
360 if (!LdSt.mayLoadOrStore())
361 return false;
362
363 unsigned Opc = LdSt.getOpcode();
364 OffsetIsScalable = false;
365 const MachineOperand *BaseOp, *OffsetOp;
366 int DataOpIdx;
367
368 if (isDS(LdSt)) {
369 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
370 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
371 if (OffsetOp) {
372 // Normal, single offset LDS instruction.
373 if (!BaseOp) {
374 // DS_CONSUME/DS_APPEND use M0 for the base address.
375 // TODO: find the implicit use operand for M0 and use that as BaseOp?
376 return false;
377 }
378 BaseOps.push_back(BaseOp);
379 Offset = OffsetOp->getImm();
380 // Get appropriate operand, and compute width accordingly.
381 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
382 if (DataOpIdx == -1)
383 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
384 Width = getOpSize(LdSt, DataOpIdx);
385 } else {
386 // The 2 offset instructions use offset0 and offset1 instead. We can treat
387 // these as a load with a single offset if the 2 offsets are consecutive.
388 // We will use this for some partially aligned loads.
389 const MachineOperand *Offset0Op =
390 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
391 const MachineOperand *Offset1Op =
392 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
393
394 unsigned Offset0 = Offset0Op->getImm() & 0xff;
395 unsigned Offset1 = Offset1Op->getImm() & 0xff;
396 if (Offset0 + 1 != Offset1)
397 return false;
398
399 // Each of these offsets is in element sized units, so we need to convert
400 // to bytes of the individual reads.
401
402 unsigned EltSize;
403 if (LdSt.mayLoad())
404 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
405 else {
406 assert(LdSt.mayStore());
407 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
408 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
409 }
410
411 if (isStride64(Opc))
412 EltSize *= 64;
413
414 BaseOps.push_back(BaseOp);
415 Offset = EltSize * Offset0;
416 // Get appropriate operand(s), and compute width accordingly.
417 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
418 if (DataOpIdx == -1) {
419 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 Width = getOpSize(LdSt, DataOpIdx);
421 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
422 Width = Width.getValue() + getOpSize(LdSt, DataOpIdx);
423 } else {
424 Width = getOpSize(LdSt, DataOpIdx);
425 }
426 }
427 return true;
428 }
429
430 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
431 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
432 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
433 return false;
434 BaseOps.push_back(RSrc);
435 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
436 if (BaseOp && !BaseOp->isFI())
437 BaseOps.push_back(BaseOp);
438 const MachineOperand *OffsetImm =
439 getNamedOperand(LdSt, AMDGPU::OpName::offset);
440 Offset = OffsetImm->getImm();
441 const MachineOperand *SOffset =
442 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
443 if (SOffset) {
444 if (SOffset->isReg())
445 BaseOps.push_back(SOffset);
446 else
447 Offset += SOffset->getImm();
448 }
449 // Get appropriate operand, and compute width accordingly.
450 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
451 if (DataOpIdx == -1)
452 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
453 if (DataOpIdx == -1) // LDS DMA
454 return false;
455 Width = getOpSize(LdSt, DataOpIdx);
456 return true;
457 }
458
459 if (isImage(LdSt)) {
460 auto RsrcOpName =
461 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
462 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
463 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
464 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
465 if (VAddr0Idx >= 0) {
466 // GFX10 possible NSA encoding.
467 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
468 BaseOps.push_back(&LdSt.getOperand(I));
469 } else {
470 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
471 }
472 Offset = 0;
473 // Get appropriate operand, and compute width accordingly.
474 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
475 if (DataOpIdx == -1)
476 return false; // no return sampler
477 Width = getOpSize(LdSt, DataOpIdx);
478 return true;
479 }
480
481 if (isSMRD(LdSt)) {
482 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
483 if (!BaseOp) // e.g. S_MEMTIME
484 return false;
485 BaseOps.push_back(BaseOp);
486 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
487 Offset = OffsetOp ? OffsetOp->getImm() : 0;
488 // Get appropriate operand, and compute width accordingly.
489 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
490 if (DataOpIdx == -1)
491 return false;
492 Width = getOpSize(LdSt, DataOpIdx);
493 return true;
494 }
495
496 if (isFLAT(LdSt)) {
497 // Instructions have either vaddr or saddr or both or none.
498 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
499 if (BaseOp)
500 BaseOps.push_back(BaseOp);
501 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
502 if (BaseOp)
503 BaseOps.push_back(BaseOp);
504 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
505 // Get appropriate operand, and compute width accordingly.
506 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
507 if (DataOpIdx == -1)
508 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
509 if (DataOpIdx == -1) // LDS DMA
510 return false;
511 Width = getOpSize(LdSt, DataOpIdx);
512 return true;
513 }
514
515 return false;
516}
517
518static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
519 ArrayRef<const MachineOperand *> BaseOps1,
520 const MachineInstr &MI2,
521 ArrayRef<const MachineOperand *> BaseOps2) {
522 // Only examine the first "base" operand of each instruction, on the
523 // assumption that it represents the real base address of the memory access.
524 // Other operands are typically offsets or indices from this base address.
525 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
526 return true;
527
528 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
529 return false;
530
531 auto *MO1 = *MI1.memoperands_begin();
532 auto *MO2 = *MI2.memoperands_begin();
533 if (MO1->getAddrSpace() != MO2->getAddrSpace())
534 return false;
535
536 const auto *Base1 = MO1->getValue();
537 const auto *Base2 = MO2->getValue();
538 if (!Base1 || !Base2)
539 return false;
540 Base1 = getUnderlyingObject(Base1);
541 Base2 = getUnderlyingObject(Base2);
542
543 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
544 return false;
545
546 return Base1 == Base2;
547}
548
549bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
550 int64_t Offset1, bool OffsetIsScalable1,
551 ArrayRef<const MachineOperand *> BaseOps2,
552 int64_t Offset2, bool OffsetIsScalable2,
553 unsigned ClusterSize,
554 unsigned NumBytes) const {
555 // If the mem ops (to be clustered) do not have the same base ptr, then they
556 // should not be clustered
557 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
558 if (!BaseOps1.empty() && !BaseOps2.empty()) {
559 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
560 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
561 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
562 return false;
563
564 const SIMachineFunctionInfo *MFI =
565 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
566 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
567 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
568 // If only one base op is empty, they do not have the same base ptr
569 return false;
570 }
571
572 // In order to avoid register pressure, on an average, the number of DWORDS
573 // loaded together by all clustered mem ops should not exceed
574 // MaxMemoryClusterDWords. This is an empirical value based on certain
575 // observations and performance related experiments.
576 // The good thing about this heuristic is - it avoids clustering of too many
577 // sub-word loads, and also avoids clustering of wide loads. Below is the
578 // brief summary of how the heuristic behaves for various `LoadSize` when
579 // MaxMemoryClusterDWords is 8.
580 //
581 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
582 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
583 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
584 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
585 // (5) LoadSize >= 17: do not cluster
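 // For example, clustering four 8-byte loads gives LoadSize = 32/4 = 8 and
 // NumDWords = ((8+3)/4)*4 = 8, which is still within the default limit of 8,
 // so the cluster is allowed; a fifth such load would give 10 and be rejected.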
586 const unsigned LoadSize = NumBytes / ClusterSize;
587 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
588 return NumDWords <= MaxMemoryClusterDWords;
589}
590
591// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
592// the first 16 loads will be interleaved with the stores, and the next 16 will
593// be clustered as expected. It should really split into 2 16 store batches.
594//
595// Loads are clustered until this returns false, rather than trying to schedule
596// groups of stores. This also means we have to deal with saying different
597// address space loads should be clustered, and ones which might cause bank
598// conflicts.
599//
600// This might be deprecated so it might not be worth that much effort to fix.
601bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
602 int64_t Offset0, int64_t Offset1,
603 unsigned NumLoads) const {
604 assert(Offset1 > Offset0 &&
605 "Second offset should be larger than first offset!");
606 // If we have less than 16 loads in a row, and the offsets are within 64
607 // bytes, then schedule together.
608
609 // A cacheline is 64 bytes (for global memory).
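 // E.g. with Offset0 = 0 and Offset1 = 48 the two loads fall within a single
 // 64-byte cacheline and are scheduled together (provided NumLoads <= 16).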
610 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
611}
612
613static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
614 MachineBasicBlock::iterator MI,
615 const DebugLoc &DL, MCRegister DestReg,
616 MCRegister SrcReg, bool KillSrc,
617 const char *Msg = "illegal VGPR to SGPR copy") {
618 MachineFunction *MF = MBB.getParent();
619 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
620 LLVMContext &C = MF->getFunction().getContext();
621 C.diagnose(IllegalCopy);
622
623 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
624 .addReg(SrcReg, getKillRegState(KillSrc));
625}
626
627/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
628/// possible to have a direct copy in these cases on GFX908, so an intermediate
629/// VGPR copy is required.
630static void indirectCopyToAGPR(const SIInstrInfo &TII,
631 MachineBasicBlock &MBB,
632 MachineBasicBlock::iterator MI,
633 const DebugLoc &DL, MCRegister DestReg,
634 MCRegister SrcReg, bool KillSrc,
635 RegScavenger &RS, bool RegsOverlap,
636 Register ImpDefSuperReg = Register(),
637 Register ImpUseSuperReg = Register()) {
638 assert((TII.getSubtarget().hasMAIInsts() &&
639 !TII.getSubtarget().hasGFX90AInsts()) &&
640 "Expected GFX908 subtarget.");
641
642 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
643 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
644 "Source register of the copy should be either an SGPR or an AGPR.");
645
646 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
647 "Destination register of the copy should be an AGPR.");
648
649 const SIRegisterInfo &RI = TII.getRegisterInfo();
650
651 // First try to find defining accvgpr_write to avoid temporary registers.
652 // In the case of copies of overlapping AGPRs, we conservatively do not
653 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
654 // an accvgpr_write used for this same copy due to implicit-defs
655 if (!RegsOverlap) {
656 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
657 --Def;
658
659 if (!Def->modifiesRegister(SrcReg, &RI))
660 continue;
661
662 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
663 Def->getOperand(0).getReg() != SrcReg)
664 break;
665
666 MachineOperand &DefOp = Def->getOperand(1);
667 assert(DefOp.isReg() || DefOp.isImm());
668
669 if (DefOp.isReg()) {
670 bool SafeToPropagate = true;
671 // Check that register source operand is not clobbered before MI.
672 // Immediate operands are always safe to propagate.
673 for (auto I = Def; I != MI && SafeToPropagate; ++I)
674 if (I->modifiesRegister(DefOp.getReg(), &RI))
675 SafeToPropagate = false;
676
677 if (!SafeToPropagate)
678 break;
679
680 DefOp.setIsKill(false);
681 }
682
683 MachineInstrBuilder Builder =
684 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
685 .add(DefOp);
686 if (ImpDefSuperReg)
687 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
688
689 if (ImpUseSuperReg) {
690 Builder.addReg(ImpUseSuperReg,
691 getKillRegState(KillSrc) | RegState::Implicit);
692 }
693
694 return;
695 }
696 }
697
698 RS.enterBasicBlockEnd(MBB);
699 RS.backward(std::next(MI));
700
701 // Ideally we want to have three registers for a long reg_sequence copy
702 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
703 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
704 *MBB.getParent());
705
706 // Registers in the sequence are allocated contiguously so we can just
707 // use register number to pick one of three round-robin temps.
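 // (E.g. a copy into AGPR5 starts with RegNo = 5 % 3 = 2, so up to two
 // scavenge attempts below may pick a temp distinct from neighbouring copies.)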
708 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
709 Register Tmp =
710 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
711 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
712 "VGPR used for an intermediate copy should have been reserved.");
713
714 // Only loop through if there are any free registers left. We don't want to
715 // spill.
716 while (RegNo--) {
717 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
718 /* RestoreAfter */ false, 0,
719 /* AllowSpill */ false);
720 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
721 break;
722 Tmp = Tmp2;
723 RS.setRegUsed(Tmp);
724 }
725
726 // Insert copy to temporary VGPR.
727 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
728 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
729 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
730 } else {
731 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
732 }
733
734 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
735 .addReg(SrcReg, getKillRegState(KillSrc));
736 if (ImpUseSuperReg) {
737 UseBuilder.addReg(ImpUseSuperReg,
738 getKillRegState(KillSrc) | RegState::Implicit);
739 }
740
741 MachineInstrBuilder DefBuilder
742 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
743 .addReg(Tmp, RegState::Kill);
744
745 if (ImpDefSuperReg)
746 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
747}
748
749static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
750 MachineBasicBlock::iterator I, const DebugLoc &DL,
751 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
752 const TargetRegisterClass *RC, bool Forward) {
753 const SIRegisterInfo &RI = TII.getRegisterInfo();
754 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
756 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
757
758 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
759 int16_t SubIdx = BaseIndices[Idx];
760 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
761 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
762 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
763 unsigned Opcode = AMDGPU::S_MOV_B32;
764
765 // Is SGPR aligned? If so try to combine with next.
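 // (E.g. an aligned copy such as s[8:15] -> s[16:23] is emitted as four
 // S_MOV_B64s rather than eight S_MOV_B32s.)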
766 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
767 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
768 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
769 // Can use SGPR64 copy
770 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
771 SubIdx = RI.getSubRegFromChannel(Channel, 2);
772 DestSubReg = RI.getSubReg(DestReg, SubIdx);
773 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
774 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
775 Opcode = AMDGPU::S_MOV_B64;
776 Idx++;
777 }
778
779 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
780 .addReg(SrcSubReg)
781 .addReg(SrcReg, RegState::Implicit);
782
783 if (!FirstMI)
784 FirstMI = LastMI;
785
786 if (!Forward)
787 I--;
788 }
789
790 assert(FirstMI && LastMI);
791 if (!Forward)
792 std::swap(FirstMI, LastMI);
793
794 FirstMI->addOperand(
795 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
796
797 if (KillSrc)
798 LastMI->addRegisterKilled(SrcReg, &RI);
799}
800
801void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
802 MachineBasicBlock::iterator MI,
803 const DebugLoc &DL, MCRegister DestReg,
804 MCRegister SrcReg, bool KillSrc,
805 bool RenamableDest, bool RenamableSrc) const {
806 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
807 unsigned Size = RI.getRegSizeInBits(*RC);
808 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
809 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
810
811 // The rest of copyPhysReg assumes Src and Dst size are the same size.
812 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
813 // we remove Fix16BitCopies and this code block?
814 if (Fix16BitCopies) {
815 if (((Size == 16) != (SrcSize == 16))) {
816 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
818 MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
819 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
820 RegToFix = SubReg;
821
822 if (DestReg == SrcReg) {
823 // Identity copy. Insert empty bundle since ExpandPostRA expects an
824 // instruction here.
825 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
826 return;
827 }
828 RC = RI.getPhysRegBaseClass(DestReg);
829 Size = RI.getRegSizeInBits(*RC);
830 SrcRC = RI.getPhysRegBaseClass(SrcReg);
831 SrcSize = RI.getRegSizeInBits(*SrcRC);
832 }
833 }
834
835 if (RC == &AMDGPU::VGPR_32RegClass) {
836 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
837 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
838 AMDGPU::AGPR_32RegClass.contains(SrcReg));
839 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
840 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
841 BuildMI(MBB, MI, DL, get(Opc), DestReg)
842 .addReg(SrcReg, getKillRegState(KillSrc));
843 return;
844 }
845
846 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
847 RC == &AMDGPU::SReg_32RegClass) {
848 if (SrcReg == AMDGPU::SCC) {
849 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
850 .addImm(1)
851 .addImm(0);
852 return;
853 }
854
855 if (DestReg == AMDGPU::VCC_LO) {
856 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
857 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
858 .addReg(SrcReg, getKillRegState(KillSrc));
859 } else {
860 // FIXME: Hack until VReg_1 removed.
861 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
862 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
863 .addImm(0)
864 .addReg(SrcReg, getKillRegState(KillSrc));
865 }
866
867 return;
868 }
869
870 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
871 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
872 return;
873 }
874
875 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
876 .addReg(SrcReg, getKillRegState(KillSrc));
877 return;
878 }
879
880 if (RC == &AMDGPU::SReg_64RegClass) {
881 if (SrcReg == AMDGPU::SCC) {
882 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
883 .addImm(1)
884 .addImm(0);
885 return;
886 }
887
888 if (DestReg == AMDGPU::VCC) {
889 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
890 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
891 .addReg(SrcReg, getKillRegState(KillSrc));
892 } else {
893 // FIXME: Hack until VReg_1 removed.
894 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
895 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
896 .addImm(0)
897 .addReg(SrcReg, getKillRegState(KillSrc));
898 }
899
900 return;
901 }
902
903 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
904 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
905 return;
906 }
907
908 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
909 .addReg(SrcReg, getKillRegState(KillSrc));
910 return;
911 }
912
913 if (DestReg == AMDGPU::SCC) {
914 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
915 // but SelectionDAG emits such copies for i1 sources.
916 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
917 // This copy can only be produced by patterns
918 // with explicit SCC, which are known to be enabled
919 // only for subtargets with S_CMP_LG_U64 present.
920 assert(ST.hasScalarCompareEq64());
921 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
922 .addReg(SrcReg, getKillRegState(KillSrc))
923 .addImm(0);
924 } else {
925 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
926 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
927 .addReg(SrcReg, getKillRegState(KillSrc))
928 .addImm(0);
929 }
930
931 return;
932 }
933
934 if (RC == &AMDGPU::AGPR_32RegClass) {
935 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
936 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
937 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
938 .addReg(SrcReg, getKillRegState(KillSrc));
939 return;
940 }
941
942 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
943 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
944 .addReg(SrcReg, getKillRegState(KillSrc));
945 return;
946 }
947
948 // FIXME: Pass should maintain scavenger to avoid scan through the block on
949 // every AGPR spill.
950 RegScavenger RS;
951 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
952 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
953 return;
954 }
955
956 if (Size == 16) {
957 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
958 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
959 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
960
961 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
962 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
963 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
964 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
965 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
966 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
967 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
968 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
969
970 if (IsSGPRDst) {
971 if (!IsSGPRSrc) {
972 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
973 return;
974 }
975
976 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
977 .addReg(NewSrcReg, getKillRegState(KillSrc));
978 return;
979 }
980
981 if (IsAGPRDst || IsAGPRSrc) {
982 if (!DstLow || !SrcLow) {
983 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
984 "Cannot use hi16 subreg with an AGPR!");
985 }
986
987 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
988 return;
989 }
990
991 if (ST.hasTrue16BitInsts()) {
992 if (IsSGPRSrc) {
993 assert(SrcLow);
994 SrcReg = NewSrcReg;
995 }
996 // Use the smaller instruction encoding if possible.
997 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
998 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
999 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1000 .addReg(SrcReg);
1001 } else {
1002 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1003 .addImm(0) // src0_modifiers
1004 .addReg(SrcReg)
1005 .addImm(0); // op_sel
1006 }
1007 return;
1008 }
1009
1010 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1011 if (!DstLow || !SrcLow) {
1012 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1013 "Cannot use hi16 subreg on VI!");
1014 }
1015
1016 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1017 .addReg(NewSrcReg, getKillRegState(KillSrc));
1018 return;
1019 }
1020
1021 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1022 .addImm(0) // src0_modifiers
1023 .addReg(NewSrcReg)
1024 .addImm(0) // clamp
1025 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1026 : AMDGPU::SDWA::SdwaSel::WORD_1)
1027 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1028 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1029 : AMDGPU::SDWA::SdwaSel::WORD_1)
1030 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1031 // First implicit operand is $exec.
1032 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1033 return;
1034 }
1035
1036 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1037 if (ST.hasMovB64()) {
1038 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1039 .addReg(SrcReg, getKillRegState(KillSrc));
1040 return;
1041 }
1042 if (ST.hasPkMovB32()) {
1043 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1044 .addImm(SISrcMods::OP_SEL_1)
1045 .addReg(SrcReg)
1046 .addImm(SISrcMods::OP_SEL_1)
1047 .addReg(SrcReg)
1048 .addImm(0) // op_sel_lo
1049 .addImm(0) // op_sel_hi
1050 .addImm(0) // neg_lo
1051 .addImm(0) // neg_hi
1052 .addImm(0) // clamp
1053 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1054 return;
1055 }
1056 }
1057
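 // Copy in the direction that reads each overlapping source subregister
 // before it is overwritten: low-to-high when the destination starts at or
 // below the source, high-to-low otherwise.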
1058 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1059 if (RI.isSGPRClass(RC)) {
1060 if (!RI.isSGPRClass(SrcRC)) {
1061 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1062 return;
1063 }
1064 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1065 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1066 Forward);
1067 return;
1068 }
1069
1070 unsigned EltSize = 4;
1071 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1072 if (RI.isAGPRClass(RC)) {
1073 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1074 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1075 else if (RI.hasVGPRs(SrcRC) ||
1076 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1077 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1078 else
1079 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1080 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1081 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1082 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1083 (RI.isProperlyAlignedRC(*RC) &&
1084 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1085 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1086 if (ST.hasMovB64()) {
1087 Opcode = AMDGPU::V_MOV_B64_e32;
1088 EltSize = 8;
1089 } else if (ST.hasPkMovB32()) {
1090 Opcode = AMDGPU::V_PK_MOV_B32;
1091 EltSize = 8;
1092 }
1093 }
1094
1095 // For the cases where we need an intermediate instruction/temporary register
1096 // (destination is an AGPR), we need a scavenger.
1097 //
1098 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1099 // whole block for every handled copy.
1100 std::unique_ptr<RegScavenger> RS;
1101 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1102 RS = std::make_unique<RegScavenger>();
1103
1104 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1105
1106 // If there is an overlap, we can't kill the super-register on the last
1107 // instruction, since it will also kill the components made live by this def.
1108 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1109 const bool CanKillSuperReg = KillSrc && !Overlap;
1110
1111 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1112 unsigned SubIdx;
1113 if (Forward)
1114 SubIdx = SubIndices[Idx];
1115 else
1116 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1117 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1118 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1119 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1120
1121 bool IsFirstSubreg = Idx == 0;
1122 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1123
1124 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1125 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1126 Register ImpUseSuper = SrcReg;
1127 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1128 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1129 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1130 MachineInstrBuilder MIB =
1131 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1132 .addImm(SISrcMods::OP_SEL_1)
1133 .addReg(SrcSubReg)
1134 .addImm(SISrcMods::OP_SEL_1)
1135 .addReg(SrcSubReg)
1136 .addImm(0) // op_sel_lo
1137 .addImm(0) // op_sel_hi
1138 .addImm(0) // neg_lo
1139 .addImm(0) // neg_hi
1140 .addImm(0) // clamp
1141 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1142 if (IsFirstSubreg)
1143 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1144 } else {
1145 MachineInstrBuilder Builder =
1146 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1147 if (IsFirstSubreg)
1148 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1149
1150 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1151 }
1152 }
1153}
1154
1155int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1156 int NewOpc;
1157
1158 // Try to map original to commuted opcode
1159 NewOpc = AMDGPU::getCommuteRev(Opcode);
1160 if (NewOpc != -1)
1161 // Check if the commuted (REV) opcode exists on the target.
1162 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1163
1164 // Try to map commuted to original opcode
1165 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1166 if (NewOpc != -1)
1167 // Check if the original (non-REV) opcode exists on the target.
1168 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1169
1170 return Opcode;
1171}
1172
1173void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
1174 MachineBasicBlock::iterator MI,
1175 const DebugLoc &DL, Register DestReg,
1176 int64_t Value) const {
1177 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1178 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1179 if (RegClass == &AMDGPU::SReg_32RegClass ||
1180 RegClass == &AMDGPU::SGPR_32RegClass ||
1181 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1182 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1183 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1184 .addImm(Value);
1185 return;
1186 }
1187
1188 if (RegClass == &AMDGPU::SReg_64RegClass ||
1189 RegClass == &AMDGPU::SGPR_64RegClass ||
1190 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1191 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1192 .addImm(Value);
1193 return;
1194 }
1195
1196 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1197 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1198 .addImm(Value);
1199 return;
1200 }
1201 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1202 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1203 .addImm(Value);
1204 return;
1205 }
1206
1207 unsigned EltSize = 4;
1208 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1209 if (RI.isSGPRClass(RegClass)) {
1210 if (RI.getRegSizeInBits(*RegClass) > 32) {
1211 Opcode = AMDGPU::S_MOV_B64;
1212 EltSize = 8;
1213 } else {
1214 Opcode = AMDGPU::S_MOV_B32;
1215 EltSize = 4;
1216 }
1217 }
1218
1219 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1220 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1221 int64_t IdxValue = Idx == 0 ? Value : 0;
1222
1223 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1224 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1225 Builder.addImm(IdxValue);
1226 }
1227}
1228
1229const TargetRegisterClass *
1230SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1231 return &AMDGPU::VGPR_32RegClass;
1232}
1233
1234void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1235 MachineBasicBlock::iterator I,
1236 const DebugLoc &DL, Register DstReg,
1237 ArrayRef<MachineOperand> Cond,
1238 Register TrueReg,
1239 Register FalseReg) const {
1240 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1241 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1242 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1243 "Not a VGPR32 reg");
1244
1245 if (Cond.size() == 1) {
1246 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1247 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1248 .add(Cond[0]);
1249 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1250 .addImm(0)
1251 .addReg(FalseReg)
1252 .addImm(0)
1253 .addReg(TrueReg)
1254 .addReg(SReg);
1255 } else if (Cond.size() == 2) {
1256 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1257 switch (Cond[0].getImm()) {
1258 case SIInstrInfo::SCC_TRUE: {
1259 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1260 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1261 : AMDGPU::S_CSELECT_B64), SReg)
1262 .addImm(1)
1263 .addImm(0);
1264 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1265 .addImm(0)
1266 .addReg(FalseReg)
1267 .addImm(0)
1268 .addReg(TrueReg)
1269 .addReg(SReg);
1270 break;
1271 }
1272 case SIInstrInfo::SCC_FALSE: {
1273 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1274 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1275 : AMDGPU::S_CSELECT_B64), SReg)
1276 .addImm(0)
1277 .addImm(1);
1278 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1279 .addImm(0)
1280 .addReg(FalseReg)
1281 .addImm(0)
1282 .addReg(TrueReg)
1283 .addReg(SReg);
1284 break;
1285 }
1286 case SIInstrInfo::VCCNZ: {
1287 MachineOperand RegOp = Cond[1];
1288 RegOp.setImplicit(false);
1289 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1290 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1291 .add(RegOp);
1292 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1293 .addImm(0)
1294 .addReg(FalseReg)
1295 .addImm(0)
1296 .addReg(TrueReg)
1297 .addReg(SReg);
1298 break;
1299 }
1300 case SIInstrInfo::VCCZ: {
1301 MachineOperand RegOp = Cond[1];
1302 RegOp.setImplicit(false);
1303 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1304 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1305 .add(RegOp);
1306 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1307 .addImm(0)
1308 .addReg(TrueReg)
1309 .addImm(0)
1310 .addReg(FalseReg)
1311 .addReg(SReg);
1312 break;
1313 }
1314 case SIInstrInfo::EXECNZ: {
1315 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1316 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1317 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1318 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1319 .addImm(0);
1320 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1321 : AMDGPU::S_CSELECT_B64), SReg)
1322 .addImm(1)
1323 .addImm(0);
1324 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1325 .addImm(0)
1326 .addReg(FalseReg)
1327 .addImm(0)
1328 .addReg(TrueReg)
1329 .addReg(SReg);
1330 break;
1331 }
1332 case SIInstrInfo::EXECZ: {
1333 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1334 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1335 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1336 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1337 .addImm(0);
1338 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1339 : AMDGPU::S_CSELECT_B64), SReg)
1340 .addImm(0)
1341 .addImm(1);
1342 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1343 .addImm(0)
1344 .addReg(FalseReg)
1345 .addImm(0)
1346 .addReg(TrueReg)
1347 .addReg(SReg);
1348 llvm_unreachable("Unhandled branch predicate EXECZ");
1349 break;
1350 }
1351 default:
1352 llvm_unreachable("invalid branch predicate");
1353 }
1354 } else {
1355 llvm_unreachable("Can only handle Cond size 1 or 2");
1356 }
1357}
1358
1359Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1360 MachineBasicBlock::iterator I,
1361 const DebugLoc &DL,
1362 Register SrcReg, int Value) const {
1363 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1364 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1365 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1366 .addImm(Value)
1367 .addReg(SrcReg);
1368
1369 return Reg;
1370}
1371
1372Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1373 MachineBasicBlock::iterator I,
1374 const DebugLoc &DL,
1375 Register SrcReg, int Value) const {
1376 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1377 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1378 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1379 .addImm(Value)
1380 .addReg(SrcReg);
1381
1382 return Reg;
1383}
1384
1385unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1386
1387 if (RI.isAGPRClass(DstRC))
1388 return AMDGPU::COPY;
1389 if (RI.getRegSizeInBits(*DstRC) == 16) {
1390 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1391 // before RA.
1392 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1393 }
1394 if (RI.getRegSizeInBits(*DstRC) == 32)
1395 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1396 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1397 return AMDGPU::S_MOV_B64;
1398 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1399 return AMDGPU::V_MOV_B64_PSEUDO;
1400 return AMDGPU::COPY;
1401}
1402
1403const MCInstrDesc &
1404SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1405 bool IsIndirectSrc) const {
1406 if (IsIndirectSrc) {
1407 if (VecSize <= 32) // 4 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1409 if (VecSize <= 64) // 8 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1411 if (VecSize <= 96) // 12 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1413 if (VecSize <= 128) // 16 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1415 if (VecSize <= 160) // 20 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1417 if (VecSize <= 256) // 32 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1419 if (VecSize <= 288) // 36 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1421 if (VecSize <= 320) // 40 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1423 if (VecSize <= 352) // 44 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1425 if (VecSize <= 384) // 48 bytes
1426 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1427 if (VecSize <= 512) // 64 bytes
1428 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1429 if (VecSize <= 1024) // 128 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1431
1432 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1433 }
1434
1435 if (VecSize <= 32) // 4 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1437 if (VecSize <= 64) // 8 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1439 if (VecSize <= 96) // 12 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1441 if (VecSize <= 128) // 16 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1443 if (VecSize <= 160) // 20 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1445 if (VecSize <= 256) // 32 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1447 if (VecSize <= 288) // 36 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1449 if (VecSize <= 320) // 40 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1451 if (VecSize <= 352) // 44 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1453 if (VecSize <= 384) // 48 bytes
1454 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1455 if (VecSize <= 512) // 64 bytes
1456 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1457 if (VecSize <= 1024) // 128 bytes
1458 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1459
1460 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1461}
1462
1463static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1464 if (VecSize <= 32) // 4 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1466 if (VecSize <= 64) // 8 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1468 if (VecSize <= 96) // 12 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1470 if (VecSize <= 128) // 16 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1472 if (VecSize <= 160) // 20 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1474 if (VecSize <= 256) // 32 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1476 if (VecSize <= 288) // 36 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1478 if (VecSize <= 320) // 40 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1480 if (VecSize <= 352) // 44 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1482 if (VecSize <= 384) // 48 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1484 if (VecSize <= 512) // 64 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1486 if (VecSize <= 1024) // 128 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1488
1489 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1490}
1491
1492static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1493 if (VecSize <= 32) // 4 bytes
1494 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1495 if (VecSize <= 64) // 8 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1497 if (VecSize <= 96) // 12 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1499 if (VecSize <= 128) // 16 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1501 if (VecSize <= 160) // 20 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1503 if (VecSize <= 256) // 32 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1505 if (VecSize <= 288) // 36 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1507 if (VecSize <= 320) // 40 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1509 if (VecSize <= 352) // 44 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1511 if (VecSize <= 384) // 48 bytes
1512 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1513 if (VecSize <= 512) // 64 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1515 if (VecSize <= 1024) // 128 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1517
1518 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1519}
1520
1521static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1522 if (VecSize <= 64) // 8 bytes
1523 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1524 if (VecSize <= 128) // 16 bytes
1525 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1526 if (VecSize <= 256) // 32 bytes
1527 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1528 if (VecSize <= 512) // 64 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1530 if (VecSize <= 1024) // 128 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1532
1533 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1534}
1535
1536const MCInstrDesc &
1537SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1538 bool IsSGPR) const {
1539 if (IsSGPR) {
1540 switch (EltSize) {
1541 case 32:
1542 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1543 case 64:
1544 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1545 default:
1546 llvm_unreachable("invalid reg indexing elt size");
1547 }
1548 }
1549
1550 assert(EltSize == 32 && "invalid reg indexing elt size");
1551 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1552}
1553
1554static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1555 switch (Size) {
1556 case 4:
1557 return AMDGPU::SI_SPILL_S32_SAVE;
1558 case 8:
1559 return AMDGPU::SI_SPILL_S64_SAVE;
1560 case 12:
1561 return AMDGPU::SI_SPILL_S96_SAVE;
1562 case 16:
1563 return AMDGPU::SI_SPILL_S128_SAVE;
1564 case 20:
1565 return AMDGPU::SI_SPILL_S160_SAVE;
1566 case 24:
1567 return AMDGPU::SI_SPILL_S192_SAVE;
1568 case 28:
1569 return AMDGPU::SI_SPILL_S224_SAVE;
1570 case 32:
1571 return AMDGPU::SI_SPILL_S256_SAVE;
1572 case 36:
1573 return AMDGPU::SI_SPILL_S288_SAVE;
1574 case 40:
1575 return AMDGPU::SI_SPILL_S320_SAVE;
1576 case 44:
1577 return AMDGPU::SI_SPILL_S352_SAVE;
1578 case 48:
1579 return AMDGPU::SI_SPILL_S384_SAVE;
1580 case 64:
1581 return AMDGPU::SI_SPILL_S512_SAVE;
1582 case 128:
1583 return AMDGPU::SI_SPILL_S1024_SAVE;
1584 default:
1585 llvm_unreachable("unknown register size");
1586 }
1587}
1588
1589static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1590 switch (Size) {
1591 case 4:
1592 return AMDGPU::SI_SPILL_V32_SAVE;
1593 case 8:
1594 return AMDGPU::SI_SPILL_V64_SAVE;
1595 case 12:
1596 return AMDGPU::SI_SPILL_V96_SAVE;
1597 case 16:
1598 return AMDGPU::SI_SPILL_V128_SAVE;
1599 case 20:
1600 return AMDGPU::SI_SPILL_V160_SAVE;
1601 case 24:
1602 return AMDGPU::SI_SPILL_V192_SAVE;
1603 case 28:
1604 return AMDGPU::SI_SPILL_V224_SAVE;
1605 case 32:
1606 return AMDGPU::SI_SPILL_V256_SAVE;
1607 case 36:
1608 return AMDGPU::SI_SPILL_V288_SAVE;
1609 case 40:
1610 return AMDGPU::SI_SPILL_V320_SAVE;
1611 case 44:
1612 return AMDGPU::SI_SPILL_V352_SAVE;
1613 case 48:
1614 return AMDGPU::SI_SPILL_V384_SAVE;
1615 case 64:
1616 return AMDGPU::SI_SPILL_V512_SAVE;
1617 case 128:
1618 return AMDGPU::SI_SPILL_V1024_SAVE;
1619 default:
1620 llvm_unreachable("unknown register size");
1621 }
1622}
1623
1624static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1625 switch (Size) {
1626 case 4:
1627 return AMDGPU::SI_SPILL_A32_SAVE;
1628 case 8:
1629 return AMDGPU::SI_SPILL_A64_SAVE;
1630 case 12:
1631 return AMDGPU::SI_SPILL_A96_SAVE;
1632 case 16:
1633 return AMDGPU::SI_SPILL_A128_SAVE;
1634 case 20:
1635 return AMDGPU::SI_SPILL_A160_SAVE;
1636 case 24:
1637 return AMDGPU::SI_SPILL_A192_SAVE;
1638 case 28:
1639 return AMDGPU::SI_SPILL_A224_SAVE;
1640 case 32:
1641 return AMDGPU::SI_SPILL_A256_SAVE;
1642 case 36:
1643 return AMDGPU::SI_SPILL_A288_SAVE;
1644 case 40:
1645 return AMDGPU::SI_SPILL_A320_SAVE;
1646 case 44:
1647 return AMDGPU::SI_SPILL_A352_SAVE;
1648 case 48:
1649 return AMDGPU::SI_SPILL_A384_SAVE;
1650 case 64:
1651 return AMDGPU::SI_SPILL_A512_SAVE;
1652 case 128:
1653 return AMDGPU::SI_SPILL_A1024_SAVE;
1654 default:
1655 llvm_unreachable("unknown register size");
1656 }
1657}
1658
1659static unsigned getAVSpillSaveOpcode(unsigned Size) {
1660 switch (Size) {
1661 case 4:
1662 return AMDGPU::SI_SPILL_AV32_SAVE;
1663 case 8:
1664 return AMDGPU::SI_SPILL_AV64_SAVE;
1665 case 12:
1666 return AMDGPU::SI_SPILL_AV96_SAVE;
1667 case 16:
1668 return AMDGPU::SI_SPILL_AV128_SAVE;
1669 case 20:
1670 return AMDGPU::SI_SPILL_AV160_SAVE;
1671 case 24:
1672 return AMDGPU::SI_SPILL_AV192_SAVE;
1673 case 28:
1674 return AMDGPU::SI_SPILL_AV224_SAVE;
1675 case 32:
1676 return AMDGPU::SI_SPILL_AV256_SAVE;
1677 case 36:
1678 return AMDGPU::SI_SPILL_AV288_SAVE;
1679 case 40:
1680 return AMDGPU::SI_SPILL_AV320_SAVE;
1681 case 44:
1682 return AMDGPU::SI_SPILL_AV352_SAVE;
1683 case 48:
1684 return AMDGPU::SI_SPILL_AV384_SAVE;
1685 case 64:
1686 return AMDGPU::SI_SPILL_AV512_SAVE;
1687 case 128:
1688 return AMDGPU::SI_SPILL_AV1024_SAVE;
1689 default:
1690 llvm_unreachable("unknown register size");
1691 }
1692}
1693
1694static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1695 bool IsVectorSuperClass) {
1696 // Currently, there is only 32-bit WWM register spills needed.
1697 if (Size != 4)
1698 llvm_unreachable("unknown wwm register spill size");
1699
1700 if (IsVectorSuperClass)
1701 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1702
1703 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1704}
1705
1706static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1707 const TargetRegisterClass *RC,
1708 unsigned Size,
1709 const SIRegisterInfo &TRI,
1710 const SIMachineFunctionInfo &MFI) {
1711 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1712
1713 // Choose the right opcode if spilling a WWM register.
1714 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1715 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1716
1717 if (IsVectorSuperClass)
1718 return getAVSpillSaveOpcode(Size);
1719
1720 return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1721 : getVGPRSpillSaveOpcode(Size);
1722}
1723
1724void SIInstrInfo::storeRegToStackSlot(
1725 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1726 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1727 const TargetRegisterInfo *TRI, Register VReg) const {
1728 MachineFunction *MF = MBB.getParent();
1729 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1730 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1731 const DebugLoc &DL = MBB.findDebugLoc(MI);
1732
1733 MachinePointerInfo PtrInfo
1734 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1735 MachineMemOperand *MMO = MF->getMachineMemOperand(
1736 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1737 FrameInfo.getObjectAlign(FrameIndex));
1738 unsigned SpillSize = TRI->getSpillSize(*RC);
1739
1740 MachineRegisterInfo &MRI = MF->getRegInfo();
1741 if (RI.isSGPRClass(RC)) {
1742 MFI->setHasSpilledSGPRs();
1743 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1744 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1745 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1746
1747 // We are only allowed to create one new instruction when spilling
1748 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1749 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1750
1751 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1752 // to make sure we are using the correct register class.
1753 if (SrcReg.isVirtual() && SpillSize == 4) {
1754 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1755 }
1756
1757 BuildMI(MBB, MI, DL, OpDesc)
1758 .addReg(SrcReg, getKillRegState(isKill)) // data
1759 .addFrameIndex(FrameIndex) // addr
1760 .addMemOperand(MMO)
1761 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1762
1763 if (RI.spillSGPRToVGPR())
1764 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1765 return;
1766 }
1767
1768 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1769 SpillSize, RI, *MFI);
1770 MFI->setHasSpilledVGPRs();
1771
1772 BuildMI(MBB, MI, DL, get(Opcode))
1773 .addReg(SrcReg, getKillRegState(isKill)) // data
1774 .addFrameIndex(FrameIndex) // addr
1775 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1776 .addImm(0) // offset
1777 .addMemOperand(MMO);
1778}
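// As a rough sketch (the register names and frame slot are illustrative),
// spilling a killed 32-bit VGPR ends up as a pseudo of the form
//   SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0
//     :: (store (s32) into %stack.0, addrspace 5)
// i.e. data register, frame index, stack pointer offset register, a zero
// instruction offset, and the frame-slot memory operand built above.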
1779
1780static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1781 switch (Size) {
1782 case 4:
1783 return AMDGPU::SI_SPILL_S32_RESTORE;
1784 case 8:
1785 return AMDGPU::SI_SPILL_S64_RESTORE;
1786 case 12:
1787 return AMDGPU::SI_SPILL_S96_RESTORE;
1788 case 16:
1789 return AMDGPU::SI_SPILL_S128_RESTORE;
1790 case 20:
1791 return AMDGPU::SI_SPILL_S160_RESTORE;
1792 case 24:
1793 return AMDGPU::SI_SPILL_S192_RESTORE;
1794 case 28:
1795 return AMDGPU::SI_SPILL_S224_RESTORE;
1796 case 32:
1797 return AMDGPU::SI_SPILL_S256_RESTORE;
1798 case 36:
1799 return AMDGPU::SI_SPILL_S288_RESTORE;
1800 case 40:
1801 return AMDGPU::SI_SPILL_S320_RESTORE;
1802 case 44:
1803 return AMDGPU::SI_SPILL_S352_RESTORE;
1804 case 48:
1805 return AMDGPU::SI_SPILL_S384_RESTORE;
1806 case 64:
1807 return AMDGPU::SI_SPILL_S512_RESTORE;
1808 case 128:
1809 return AMDGPU::SI_SPILL_S1024_RESTORE;
1810 default:
1811 llvm_unreachable("unknown register size");
1812 }
1813}
1814
1815static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1816 switch (Size) {
1817 case 4:
1818 return AMDGPU::SI_SPILL_V32_RESTORE;
1819 case 8:
1820 return AMDGPU::SI_SPILL_V64_RESTORE;
1821 case 12:
1822 return AMDGPU::SI_SPILL_V96_RESTORE;
1823 case 16:
1824 return AMDGPU::SI_SPILL_V128_RESTORE;
1825 case 20:
1826 return AMDGPU::SI_SPILL_V160_RESTORE;
1827 case 24:
1828 return AMDGPU::SI_SPILL_V192_RESTORE;
1829 case 28:
1830 return AMDGPU::SI_SPILL_V224_RESTORE;
1831 case 32:
1832 return AMDGPU::SI_SPILL_V256_RESTORE;
1833 case 36:
1834 return AMDGPU::SI_SPILL_V288_RESTORE;
1835 case 40:
1836 return AMDGPU::SI_SPILL_V320_RESTORE;
1837 case 44:
1838 return AMDGPU::SI_SPILL_V352_RESTORE;
1839 case 48:
1840 return AMDGPU::SI_SPILL_V384_RESTORE;
1841 case 64:
1842 return AMDGPU::SI_SPILL_V512_RESTORE;
1843 case 128:
1844 return AMDGPU::SI_SPILL_V1024_RESTORE;
1845 default:
1846 llvm_unreachable("unknown register size");
1847 }
1848}
1849
1850static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1851 switch (Size) {
1852 case 4:
1853 return AMDGPU::SI_SPILL_A32_RESTORE;
1854 case 8:
1855 return AMDGPU::SI_SPILL_A64_RESTORE;
1856 case 12:
1857 return AMDGPU::SI_SPILL_A96_RESTORE;
1858 case 16:
1859 return AMDGPU::SI_SPILL_A128_RESTORE;
1860 case 20:
1861 return AMDGPU::SI_SPILL_A160_RESTORE;
1862 case 24:
1863 return AMDGPU::SI_SPILL_A192_RESTORE;
1864 case 28:
1865 return AMDGPU::SI_SPILL_A224_RESTORE;
1866 case 32:
1867 return AMDGPU::SI_SPILL_A256_RESTORE;
1868 case 36:
1869 return AMDGPU::SI_SPILL_A288_RESTORE;
1870 case 40:
1871 return AMDGPU::SI_SPILL_A320_RESTORE;
1872 case 44:
1873 return AMDGPU::SI_SPILL_A352_RESTORE;
1874 case 48:
1875 return AMDGPU::SI_SPILL_A384_RESTORE;
1876 case 64:
1877 return AMDGPU::SI_SPILL_A512_RESTORE;
1878 case 128:
1879 return AMDGPU::SI_SPILL_A1024_RESTORE;
1880 default:
1881 llvm_unreachable("unknown register size");
1882 }
1883}
1884
1885static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1886 switch (Size) {
1887 case 4:
1888 return AMDGPU::SI_SPILL_AV32_RESTORE;
1889 case 8:
1890 return AMDGPU::SI_SPILL_AV64_RESTORE;
1891 case 12:
1892 return AMDGPU::SI_SPILL_AV96_RESTORE;
1893 case 16:
1894 return AMDGPU::SI_SPILL_AV128_RESTORE;
1895 case 20:
1896 return AMDGPU::SI_SPILL_AV160_RESTORE;
1897 case 24:
1898 return AMDGPU::SI_SPILL_AV192_RESTORE;
1899 case 28:
1900 return AMDGPU::SI_SPILL_AV224_RESTORE;
1901 case 32:
1902 return AMDGPU::SI_SPILL_AV256_RESTORE;
1903 case 36:
1904 return AMDGPU::SI_SPILL_AV288_RESTORE;
1905 case 40:
1906 return AMDGPU::SI_SPILL_AV320_RESTORE;
1907 case 44:
1908 return AMDGPU::SI_SPILL_AV352_RESTORE;
1909 case 48:
1910 return AMDGPU::SI_SPILL_AV384_RESTORE;
1911 case 64:
1912 return AMDGPU::SI_SPILL_AV512_RESTORE;
1913 case 128:
1914 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1915 default:
1916 llvm_unreachable("unknown register size");
1917 }
1918}
1919
1920static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1921 bool IsVectorSuperClass) {
1922 // Currently, only 32-bit WWM register spills are needed.
1923 if (Size != 4)
1924 llvm_unreachable("unknown wwm register spill size");
1925
1926 if (IsVectorSuperClass)
1927 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1928
1929 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1930}
1931
1932static unsigned
1933 getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1934 unsigned Size, const SIRegisterInfo &TRI,
1935 const SIMachineFunctionInfo &MFI) {
1936 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1937
1938 // Choose the right opcode if restoring a WWM register.
1939 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1940 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1941
1942 if (IsVectorSuperClass)
1943 return getAVSpillRestoreOpcode(Size);
1944
1945 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1946 : getVGPRSpillRestoreOpcode(Size);
1947}
1948
1949 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1950 MachineBasicBlock::iterator MI,
1951 Register DestReg, int FrameIndex,
1952 const TargetRegisterClass *RC,
1953 const TargetRegisterInfo *TRI,
1954 Register VReg) const {
1955 MachineFunction *MF = MBB.getParent();
1956 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1957 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1958 const DebugLoc &DL = MBB.findDebugLoc(MI);
1959 unsigned SpillSize = TRI->getSpillSize(*RC);
1960
1961 MachinePointerInfo PtrInfo
1962 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1963
1964 MachineMemOperand *MMO = MF->getMachineMemOperand(
1965 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1966 FrameInfo.getObjectAlign(FrameIndex));
1967
1968 if (RI.isSGPRClass(RC)) {
1969 MFI->setHasSpilledSGPRs();
1970 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1971 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1972 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1973
1974 // FIXME: Maybe this should not include a memoperand because it will be
1975 // lowered to non-memory instructions.
1976 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1977 if (DestReg.isVirtual() && SpillSize == 4) {
1978 MachineRegisterInfo &MRI = MF->getRegInfo();
1979 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1980 }
1981
1982 if (RI.spillSGPRToVGPR())
1983 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1984 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1985 .addFrameIndex(FrameIndex) // addr
1986 .addMemOperand(MMO)
1987 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1988
1989 return;
1990 }
1991
1992 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1993 SpillSize, RI, *MFI);
1994 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1995 .addFrameIndex(FrameIndex) // vaddr
1996 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1997 .addImm(0) // offset
1998 .addMemOperand(MMO);
1999}
2000
2001 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
2002 MachineBasicBlock::iterator MI) const {
2003 insertNoops(MBB, MI, 1);
2004}
2005
2006 void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
2007 MachineBasicBlock::iterator MI,
2008 unsigned Quantity) const {
2009 DebugLoc DL = MBB.findDebugLoc(MI);
2010 while (Quantity > 0) {
2011 unsigned Arg = std::min(Quantity, 8u);
2012 Quantity -= Arg;
2013 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
2014 }
2015}
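// S_NOP encodes the wait-state count minus one, and a single S_NOP covers at
// most 8 wait states, which is why the loop chunks Quantity by 8 and emits
// Arg - 1. For example, a request for 10 no-op wait states is emitted as
// S_NOP 7 followed by S_NOP 1.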
2016
2017 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
2018 auto *MF = MBB.getParent();
2019 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2020
2021 assert(Info->isEntryFunction());
2022
2023 if (MBB.succ_empty()) {
2024 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2025 if (HasNoTerminator) {
2026 if (Info->returnsVoid()) {
2027 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2028 } else {
2029 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2030 }
2031 }
2032 }
2033}
2034
2035 MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
2036 MachineBasicBlock &MBB,
2037 MachineInstr &MI,
2038 const DebugLoc &DL) const {
2039 MachineFunction *MF = MBB.getParent();
2040 constexpr unsigned DoorbellIDMask = 0x3ff;
2041 constexpr unsigned ECQueueWaveAbort = 0x400;
2042
2043 MachineBasicBlock *TrapBB = &MBB;
2044 MachineBasicBlock *ContBB = &MBB;
2045 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2046
2047 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2048 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2049 TrapBB = MF->CreateMachineBasicBlock();
2050 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2051 MF->push_back(TrapBB);
2052 MBB.addSuccessor(TrapBB);
2053 }
2054
2055 // Start with an `s_trap 2`; if we're in PRIV=1 and need the workaround, this
2056 // will be a nop.
2057 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2058 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2059 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2060 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2061 DoorbellReg)
2063 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2064 .addUse(AMDGPU::M0);
2065 Register DoorbellRegMasked =
2066 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2067 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2068 .addUse(DoorbellReg)
2069 .addImm(DoorbellIDMask);
2070 Register SetWaveAbortBit =
2071 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2072 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2073 .addUse(DoorbellRegMasked)
2074 .addImm(ECQueueWaveAbort);
2075 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2076 .addUse(SetWaveAbortBit);
2077 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2079 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2080 .addUse(AMDGPU::TTMP2);
2081 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2082 TrapBB->addSuccessor(HaltLoopBB);
2083
2084 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2085 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2086 .addMBB(HaltLoopBB);
2087 MF->push_back(HaltLoopBB);
2088 HaltLoopBB->addSuccessor(HaltLoopBB);
2089
2090 return ContBB;
2091}
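// Roughly, the emitted sequence looks like the following (register names and
// labels are illustrative, and the exact sendmsg operands are elided):
//   s_trap 2
//   s_sendmsg_rtn_b32 sN, ...     ; query the doorbell ID
//   s_mov_b32 ttmp2, m0           ; preserve m0
//   s_and_b32 sN, sN, 0x3ff       ; DoorbellIDMask
//   s_or_b32  sN, sN, 0x400       ; ECQueueWaveAbort
//   s_mov_b32 m0, sN
//   s_sendmsg ...                 ; notify the host
//   s_mov_b32 m0, ttmp2           ; restore m0
//   s_branch halt
// halt:
//   s_sethalt 5
//   s_branch halt                 ; park the wave until it is killed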
2092
2093 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2094 switch (MI.getOpcode()) {
2095 default:
2096 if (MI.isMetaInstruction())
2097 return 0;
2098 return 1; // FIXME: Do wait states equal cycles?
2099
2100 case AMDGPU::S_NOP:
2101 return MI.getOperand(0).getImm() + 1;
2102 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2103 // hazard, even if one exists, won't really be visible. Should we handle it?
2104 }
2105}
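// For example, S_NOP 3 reports 4 wait states, meta instructions such as debug
// values report 0, and every other instruction is currently counted as a
// single wait state.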
2106
2107 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2108 MachineBasicBlock &MBB = *MI.getParent();
2109 DebugLoc DL = MBB.findDebugLoc(MI);
2110 switch (MI.getOpcode()) {
2111 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2112 case AMDGPU::S_MOV_B64_term:
2113 // This is only a terminator to get the correct spill code placement during
2114 // register allocation.
2115 MI.setDesc(get(AMDGPU::S_MOV_B64));
2116 break;
2117
2118 case AMDGPU::S_MOV_B32_term:
2119 // This is only a terminator to get the correct spill code placement during
2120 // register allocation.
2121 MI.setDesc(get(AMDGPU::S_MOV_B32));
2122 break;
2123
2124 case AMDGPU::S_XOR_B64_term:
2125 // This is only a terminator to get the correct spill code placement during
2126 // register allocation.
2127 MI.setDesc(get(AMDGPU::S_XOR_B64));
2128 break;
2129
2130 case AMDGPU::S_XOR_B32_term:
2131 // This is only a terminator to get the correct spill code placement during
2132 // register allocation.
2133 MI.setDesc(get(AMDGPU::S_XOR_B32));
2134 break;
2135 case AMDGPU::S_OR_B64_term:
2136 // This is only a terminator to get the correct spill code placement during
2137 // register allocation.
2138 MI.setDesc(get(AMDGPU::S_OR_B64));
2139 break;
2140 case AMDGPU::S_OR_B32_term:
2141 // This is only a terminator to get the correct spill code placement during
2142 // register allocation.
2143 MI.setDesc(get(AMDGPU::S_OR_B32));
2144 break;
2145
2146 case AMDGPU::S_ANDN2_B64_term:
2147 // This is only a terminator to get the correct spill code placement during
2148 // register allocation.
2149 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2150 break;
2151
2152 case AMDGPU::S_ANDN2_B32_term:
2153 // This is only a terminator to get the correct spill code placement during
2154 // register allocation.
2155 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2156 break;
2157
2158 case AMDGPU::S_AND_B64_term:
2159 // This is only a terminator to get the correct spill code placement during
2160 // register allocation.
2161 MI.setDesc(get(AMDGPU::S_AND_B64));
2162 break;
2163
2164 case AMDGPU::S_AND_B32_term:
2165 // This is only a terminator to get the correct spill code placement during
2166 // register allocation.
2167 MI.setDesc(get(AMDGPU::S_AND_B32));
2168 break;
2169
2170 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2171 // This is only a terminator to get the correct spill code placement during
2172 // register allocation.
2173 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2174 break;
2175
2176 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2177 // This is only a terminator to get the correct spill code placement during
2178 // register allocation.
2179 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2180 break;
2181
2182 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2183 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2184 break;
2185
2186 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2187 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2188 break;
2189
2190 case AMDGPU::V_MOV_B64_PSEUDO: {
2191 Register Dst = MI.getOperand(0).getReg();
2192 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2193 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2194
2195 const MachineOperand &SrcOp = MI.getOperand(1);
2196 // FIXME: Will this work for 64-bit floating point immediates?
2197 assert(!SrcOp.isFPImm());
2198 if (ST.hasMovB64()) {
2199 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2200 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2201 isUInt<32>(SrcOp.getImm()))
2202 break;
2203 }
2204 if (SrcOp.isImm()) {
2205 APInt Imm(64, SrcOp.getImm());
2206 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2207 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2208 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2209 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2211 .addImm(Lo.getSExtValue())
2213 .addImm(Lo.getSExtValue())
2214 .addImm(0) // op_sel_lo
2215 .addImm(0) // op_sel_hi
2216 .addImm(0) // neg_lo
2217 .addImm(0) // neg_hi
2218 .addImm(0); // clamp
2219 } else {
2220 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2221 .addImm(Lo.getSExtValue())
2223 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2224 .addImm(Hi.getSExtValue())
2226 }
2227 } else {
2228 assert(SrcOp.isReg());
2229 if (ST.hasPkMovB32() &&
2230 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2231 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2232 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2233 .addReg(SrcOp.getReg())
2235 .addReg(SrcOp.getReg())
2236 .addImm(0) // op_sel_lo
2237 .addImm(0) // op_sel_hi
2238 .addImm(0) // neg_lo
2239 .addImm(0) // neg_hi
2240 .addImm(0); // clamp
2241 } else {
2242 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2243 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2245 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2246 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2248 }
2249 }
2250 MI.eraseFromParent();
2251 break;
2252 }
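// For example, on a subtarget without a 64-bit VALU move, moving the
// immediate 0x0000000200000001 expands to V_MOV_B32_e32 dst.sub0, 1 and
// V_MOV_B32_e32 dst.sub1, 2, while an immediate whose 32-bit halves are the
// same inline constant can instead use a single V_PK_MOV_B32 when the
// subtarget provides it.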
2253 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2255 break;
2256 }
2257 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2258 const MachineOperand &SrcOp = MI.getOperand(1);
2259 assert(!SrcOp.isFPImm());
2260 APInt Imm(64, SrcOp.getImm());
2261 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2262 MI.setDesc(get(AMDGPU::S_MOV_B64));
2263 break;
2264 }
2265
2266 Register Dst = MI.getOperand(0).getReg();
2267 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2268 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2269
2270 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2271 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2272 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2273 .addImm(Lo.getSExtValue())
2275 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2276 .addImm(Hi.getSExtValue())
2278 MI.eraseFromParent();
2279 break;
2280 }
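// For example, 0x1234567800000005 is neither a 32-bit value nor an inline
// constant, so it is split into S_MOV_B32 dst.sub0, 0x5 and
// S_MOV_B32 dst.sub1, 0x12345678.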
2281 case AMDGPU::V_SET_INACTIVE_B32: {
2282 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2283 Register DstReg = MI.getOperand(0).getReg();
2284 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2285 .add(MI.getOperand(3))
2286 .add(MI.getOperand(4))
2287 .add(MI.getOperand(1))
2288 .add(MI.getOperand(2))
2289 .add(MI.getOperand(5));
2290 MI.eraseFromParent();
2291 break;
2292 }
2293 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2294 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2295 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2296 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2297 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2298 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2299 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2300 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2301 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2302 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2303 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2304 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2305 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2306 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2307 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2308 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2309 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2310 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2311 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2312 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2313 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2314 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2315 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2316 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2317 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2318 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2319 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2320 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2322 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2323
2324 unsigned Opc;
2325 if (RI.hasVGPRs(EltRC)) {
2326 Opc = AMDGPU::V_MOVRELD_B32_e32;
2327 } else {
2328 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2329 : AMDGPU::S_MOVRELD_B32;
2330 }
2331
2332 const MCInstrDesc &OpDesc = get(Opc);
2333 Register VecReg = MI.getOperand(0).getReg();
2334 bool IsUndef = MI.getOperand(1).isUndef();
2335 unsigned SubReg = MI.getOperand(3).getImm();
2336 assert(VecReg == MI.getOperand(1).getReg());
2337
2339 BuildMI(MBB, MI, DL, OpDesc)
2340 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2341 .add(MI.getOperand(2))
2343 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2344
2345 const int ImpDefIdx =
2346 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2347 const int ImpUseIdx = ImpDefIdx + 1;
2348 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2349 MI.eraseFromParent();
2350 break;
2351 }
2352 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2353 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2354 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2355 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2356 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2357 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2358 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2359 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2360 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2361 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2362 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2363 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2365 Register VecReg = MI.getOperand(0).getReg();
2366 bool IsUndef = MI.getOperand(1).isUndef();
2367 Register Idx = MI.getOperand(3).getReg();
2368 Register SubReg = MI.getOperand(4).getImm();
2369
2370 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2371 .addReg(Idx)
2373 SetOn->getOperand(3).setIsUndef();
2374
2375 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2377 BuildMI(MBB, MI, DL, OpDesc)
2378 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2379 .add(MI.getOperand(2))
2381 .addReg(VecReg,
2382 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2383
2384 const int ImpDefIdx =
2385 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2386 const int ImpUseIdx = ImpDefIdx + 1;
2387 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2388
2389 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2390
2391 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2392
2393 MI.eraseFromParent();
2394 break;
2395 }
2396 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2397 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2398 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2399 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2400 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2401 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2402 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2403 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2404 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2405 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2406 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2407 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2409 Register Dst = MI.getOperand(0).getReg();
2410 Register VecReg = MI.getOperand(1).getReg();
2411 bool IsUndef = MI.getOperand(1).isUndef();
2412 Register Idx = MI.getOperand(2).getReg();
2413 Register SubReg = MI.getOperand(3).getImm();
2414
2415 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2416 .addReg(Idx)
2418 SetOn->getOperand(3).setIsUndef();
2419
2420 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2421 .addDef(Dst)
2422 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2423 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2424
2425 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2426
2427 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2428
2429 MI.eraseFromParent();
2430 break;
2431 }
2432 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2433 MachineFunction &MF = *MBB.getParent();
2434 Register Reg = MI.getOperand(0).getReg();
2435 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2436 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2437 MachineOperand OpLo = MI.getOperand(1);
2438 MachineOperand OpHi = MI.getOperand(2);
2439
2440 // Create a bundle so these instructions won't be re-ordered by the
2441 // post-RA scheduler.
2442 MIBundleBuilder Bundler(MBB, MI);
2443 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2444
2445 // What we want here is an offset from the value returned by s_getpc (which
2446 // is the address of the s_add_u32 instruction) to the global variable, but
2447 // since the encoding of $symbol starts 4 bytes after the start of the
2448 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2449 // small. This requires us to add 4 to the global variable offset in order
2450 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2451 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2452 // instruction.
2453
2454 int64_t Adjust = 0;
2455 if (ST.hasGetPCZeroExtension()) {
2456 // Fix up hardware that does not sign-extend the 48-bit PC value by
2457 // inserting: s_sext_i32_i16 reghi, reghi
2458 Bundler.append(
2459 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2460 Adjust += 4;
2461 }
2462
2463 if (OpLo.isGlobal())
2464 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2465 Bundler.append(
2466 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2467
2468 if (OpHi.isGlobal())
2469 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2470 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2471 .addReg(RegHi)
2472 .add(OpHi));
2473
2474 finalizeBundle(MBB, Bundler.begin());
2475
2476 MI.eraseFromParent();
2477 break;
2478 }
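// Worked example of the offset fixups: s_getpc_b64 returns the address of the
// following s_add_u32. The 32-bit literal of that s_add_u32 is encoded 4
// bytes into the instruction, so the low half uses $symbol + 4, and the
// literal of the s_addc_u32 starts 12 bytes after the s_add_u32, so the high
// half uses $symbol + 12. If the s_sext_i32_i16 fixup is bundled in first,
// both offsets grow by another 4 bytes, matching the Adjust logic above.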
2479 case AMDGPU::ENTER_STRICT_WWM: {
2480 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2481 // Whole Wave Mode is entered.
2482 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2483 : AMDGPU::S_OR_SAVEEXEC_B64));
2484 break;
2485 }
2486 case AMDGPU::ENTER_STRICT_WQM: {
2487 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2488 // STRICT_WQM is entered.
2489 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2490 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2491 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2492 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2493 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2494
2495 MI.eraseFromParent();
2496 break;
2497 }
2498 case AMDGPU::EXIT_STRICT_WWM:
2499 case AMDGPU::EXIT_STRICT_WQM: {
2500 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2501 // WWM/STRICT_WQM is exited.
2502 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2503 break;
2504 }
2505 case AMDGPU::SI_RETURN: {
2506 const MachineFunction *MF = MBB.getParent();
2507 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2508 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2509 // Hiding the return address use with SI_RETURN may lead to extra kills in
2510 // the function and missing live-ins. We are fine in practice because callee
2511 // saved register handling ensures the register value is restored before
2512 // RET, but we need the undef flag here to appease the MachineVerifier
2513 // liveness checks.
2515 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2516 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2517
2518 MIB.copyImplicitOps(MI);
2519 MI.eraseFromParent();
2520 break;
2521 }
2522
2523 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2524 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2525 MI.setDesc(get(AMDGPU::S_MUL_U64));
2526 break;
2527
2528 case AMDGPU::S_GETPC_B64_pseudo:
2529 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2530 if (ST.hasGetPCZeroExtension()) {
2531 Register Dst = MI.getOperand(0).getReg();
2532 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2533 // Fix up hardware that does not sign-extend the 48-bit PC value by
2534 // inserting: s_sext_i32_i16 dsthi, dsthi
2535 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2536 DstHi)
2537 .addReg(DstHi);
2538 }
2539 break;
2540 }
2541 return true;
2542}
2543
2546 unsigned SubIdx, const MachineInstr &Orig,
2547 const TargetRegisterInfo &RI) const {
2548
2549 // Try shrinking the instruction to remat only the part needed for the
2550 // current context.
2551 // TODO: Handle more cases.
2552 unsigned Opcode = Orig.getOpcode();
2553 switch (Opcode) {
2554 case AMDGPU::S_LOAD_DWORDX16_IMM:
2555 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2556 if (SubIdx != 0)
2557 break;
2558
2559 if (I == MBB.end())
2560 break;
2561
2562 if (I->isBundled())
2563 break;
2564
2565 // Look for a single use of the register that is also a subreg.
2566 Register RegToFind = Orig.getOperand(0).getReg();
2567 MachineOperand *UseMO = nullptr;
2568 for (auto &CandMO : I->operands()) {
2569 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2570 continue;
2571 if (UseMO) {
2572 UseMO = nullptr;
2573 break;
2574 }
2575 UseMO = &CandMO;
2576 }
2577 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2578 break;
2579
2580 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2581 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2582
2585 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2586
2587 unsigned NewOpcode = -1;
2588 if (SubregSize == 256)
2589 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2590 else if (SubregSize == 128)
2591 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2592 else
2593 break;
2594
2595 const MCInstrDesc &TID = get(NewOpcode);
2596 const TargetRegisterClass *NewRC =
2597 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2598 MRI.setRegClass(DestReg, NewRC);
2599
2600 UseMO->setReg(DestReg);
2601 UseMO->setSubReg(AMDGPU::NoSubRegister);
2602
2603 // Use a smaller load with the desired size, possibly with updated offset.
2604 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2605 MI->setDesc(TID);
2606 MI->getOperand(0).setReg(DestReg);
2607 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2608 if (Offset) {
2609 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2610 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2611 OffsetMO->setImm(FinalOffset);
2612 }
2614 for (const MachineMemOperand *MemOp : Orig.memoperands())
2615 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2616 SubregSize / 8));
2617 MI->setMemRefs(*MF, NewMMOs);
2618
2619 MBB.insert(I, MI);
2620 return;
2621 }
2622
2623 default:
2624 break;
2625 }
2626
2627 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2628}
2629
2630std::pair<MachineInstr*, MachineInstr*>
2632 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2633
2634 if (ST.hasMovB64() &&
2636 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2637 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2638 return std::pair(&MI, nullptr);
2639 }
2640
2641 MachineBasicBlock &MBB = *MI.getParent();
2645 Register Dst = MI.getOperand(0).getReg();
2646 unsigned Part = 0;
2647 MachineInstr *Split[2];
2648
2649 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2650 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2651 if (Dst.isPhysical()) {
2652 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2653 } else {
2654 assert(MRI.isSSA());
2655 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2656 MovDPP.addDef(Tmp);
2657 }
2658
2659 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2660 const MachineOperand &SrcOp = MI.getOperand(I);
2661 assert(!SrcOp.isFPImm());
2662 if (SrcOp.isImm()) {
2663 APInt Imm(64, SrcOp.getImm());
2664 Imm.ashrInPlace(Part * 32);
2665 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2666 } else {
2667 assert(SrcOp.isReg());
2668 Register Src = SrcOp.getReg();
2669 if (Src.isPhysical())
2670 MovDPP.addReg(RI.getSubReg(Src, Sub));
2671 else
2672 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2673 }
2674 }
2675
2676 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2677 MovDPP.addImm(MO.getImm());
2678
2679 Split[Part] = MovDPP;
2680 ++Part;
2681 }
2682
2683 if (Dst.isVirtual())
2684 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2685 .addReg(Split[0]->getOperand(0).getReg())
2686 .addImm(AMDGPU::sub0)
2687 .addReg(Split[1]->getOperand(0).getReg())
2688 .addImm(AMDGPU::sub1);
2689
2690 MI.eraseFromParent();
2691 return std::pair(Split[0], Split[1]);
2692}
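// In short, unless the subtarget has a legal 64-bit DPP move for this
// dpp_ctrl, the pseudo is split into two V_MOV_B32_dpp instructions on sub0
// and sub1, and for a virtual destination the halves are stitched back
// together with a REG_SEQUENCE.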
2693
2694std::optional<DestSourcePair>
2696 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2697 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2698
2699 return std::nullopt;
2700}
2701
2703 MachineOperand &Src0,
2704 unsigned Src0OpName,
2705 MachineOperand &Src1,
2706 unsigned Src1OpName) const {
2707 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2708 if (!Src0Mods)
2709 return false;
2710
2711 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2712 assert(Src1Mods &&
2713 "All commutable instructions have both src0 and src1 modifiers");
2714
2715 int Src0ModsVal = Src0Mods->getImm();
2716 int Src1ModsVal = Src1Mods->getImm();
2717
2718 Src1Mods->setImm(Src0ModsVal);
2719 Src0Mods->setImm(Src1ModsVal);
2720 return true;
2721}
2722
2724 MachineOperand &RegOp,
2725 MachineOperand &NonRegOp) {
2726 Register Reg = RegOp.getReg();
2727 unsigned SubReg = RegOp.getSubReg();
2728 bool IsKill = RegOp.isKill();
2729 bool IsDead = RegOp.isDead();
2730 bool IsUndef = RegOp.isUndef();
2731 bool IsDebug = RegOp.isDebug();
2732
2733 if (NonRegOp.isImm())
2734 RegOp.ChangeToImmediate(NonRegOp.getImm());
2735 else if (NonRegOp.isFI())
2736 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2737 else if (NonRegOp.isGlobal()) {
2738 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2739 NonRegOp.getTargetFlags());
2740 } else
2741 return nullptr;
2742
2743 // Make sure we don't reinterpret a subreg index in the target flags.
2744 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2745
2746 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2747 NonRegOp.setSubReg(SubReg);
2748
2749 return &MI;
2750}
2751
2753 unsigned Src0Idx,
2754 unsigned Src1Idx) const {
2755 assert(!NewMI && "this should never be used");
2756
2757 unsigned Opc = MI.getOpcode();
2758 int CommutedOpcode = commuteOpcode(Opc);
2759 if (CommutedOpcode == -1)
2760 return nullptr;
2761
2762 if (Src0Idx > Src1Idx)
2763 std::swap(Src0Idx, Src1Idx);
2764
2765 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2766 static_cast<int>(Src0Idx) &&
2767 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2768 static_cast<int>(Src1Idx) &&
2769 "inconsistency with findCommutedOpIndices");
2770
2771 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2772 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2773
2774 MachineInstr *CommutedMI = nullptr;
2775 if (Src0.isReg() && Src1.isReg()) {
2776 if (isOperandLegal(MI, Src1Idx, &Src0)) {
2777 // Be sure to copy the source modifiers to the right place.
2778 CommutedMI
2779 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2780 }
2781
2782 } else if (Src0.isReg() && !Src1.isReg()) {
2783 if (isOperandLegal(MI, Src1Idx, &Src0))
2784 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2785 } else if (!Src0.isReg() && Src1.isReg()) {
2786 if (isOperandLegal(MI, Src1Idx, &Src0))
2787 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2788 } else {
2789 // FIXME: Found two non registers to commute. This does happen.
2790 return nullptr;
2791 }
2792
2793 if (CommutedMI) {
2794 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2795 Src1, AMDGPU::OpName::src1_modifiers);
2796
2797 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2798 AMDGPU::OpName::src1_sel);
2799
2800 CommutedMI->setDesc(get(CommutedOpcode));
2801 }
2802
2803 return CommutedMI;
2804}
2805
2806// This needs to be implemented because the source modifiers may be inserted
2807// between the true commutable operands, and the base
2808// TargetInstrInfo::commuteInstruction uses it.
2810 unsigned &SrcOpIdx0,
2811 unsigned &SrcOpIdx1) const {
2812 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2813}
2814
2816 unsigned &SrcOpIdx0,
2817 unsigned &SrcOpIdx1) const {
2818 if (!Desc.isCommutable())
2819 return false;
2820
2821 unsigned Opc = Desc.getOpcode();
2822 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2823 if (Src0Idx == -1)
2824 return false;
2825
2826 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2827 if (Src1Idx == -1)
2828 return false;
2829
2830 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2831}
2832
2834 int64_t BrOffset) const {
2835 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2836 // block is unanalyzable.
2837 assert(BranchOp != AMDGPU::S_SETPC_B64);
2838
2839 // Convert to dwords.
2840 BrOffset /= 4;
2841
2842 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2843 // from the next instruction.
2844 BrOffset -= 1;
2845
2846 return isIntN(BranchOffsetBits, BrOffset);
2847}
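// Worked example with the default 16-bit branch offset: a byte offset of
// 131072 is 32768 dwords, or 32767 once measured from the next instruction,
// which still fits in a signed 16-bit immediate; 131076 bytes would need
// 32768 and is out of range.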
2848
2851 return MI.getOperand(0).getMBB();
2852}
2853
2855 for (const MachineInstr &MI : MBB->terminators()) {
2856 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2857 MI.getOpcode() == AMDGPU::SI_LOOP)
2858 return true;
2859 }
2860 return false;
2861}
2862
2864 MachineBasicBlock &DestBB,
2865 MachineBasicBlock &RestoreBB,
2866 const DebugLoc &DL, int64_t BrOffset,
2867 RegScavenger *RS) const {
2868 assert(RS && "RegScavenger required for long branching");
2869 assert(MBB.empty() &&
2870 "new block should be inserted for expanding unconditional branch");
2871 assert(MBB.pred_size() == 1);
2872 assert(RestoreBB.empty() &&
2873 "restore block should be inserted for restoring clobbered registers");
2874
2878
2879 // FIXME: Virtual register workaround for RegScavenger not working with empty
2880 // blocks.
2881 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2882
2883 auto I = MBB.end();
2884
2885 // Note: as this is used after the hazard recognizer, we need to apply some
2886 // hazard workarounds directly.
2887 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2889 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2890 if (FlushSGPRWrites)
2891 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2893 };
2894
2895 // We need to compute the offset relative to the instruction immediately after
2896 // s_getpc_b64. Insert the pc arithmetic code before the last terminator.
2897 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2898 ApplyHazardWorkarounds();
2899
2900 auto &MCCtx = MF->getContext();
2901 MCSymbol *PostGetPCLabel =
2902 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2903 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2904
2905 MCSymbol *OffsetLo =
2906 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2907 MCSymbol *OffsetHi =
2908 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2909 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2910 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2911 .addReg(PCReg, 0, AMDGPU::sub0)
2912 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2913 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2914 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2915 .addReg(PCReg, 0, AMDGPU::sub1)
2916 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2917 ApplyHazardWorkarounds();
2918
2919 // Insert the indirect branch after the other terminator.
2920 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2921 .addReg(PCReg);
2922
2923 // If a spill is needed for the pc register pair, we need to insert a spill
2924 // restore block right before the destination block, and insert a short branch
2925 // into the old destination block's fallthrough predecessor.
2926 // e.g.:
2927 //
2928 // s_cbranch_scc0 skip_long_branch:
2929 //
2930 // long_branch_bb:
2931 // spill s[8:9]
2932 // s_getpc_b64 s[8:9]
2933 // s_add_u32 s8, s8, restore_bb
2934 // s_addc_u32 s9, s9, 0
2935 // s_setpc_b64 s[8:9]
2936 //
2937 // skip_long_branch:
2938 // foo;
2939 //
2940 // .....
2941 //
2942 // dest_bb_fallthrough_predecessor:
2943 // bar;
2944 // s_branch dest_bb
2945 //
2946 // restore_bb:
2947 // restore s[8:9]
2948 // fallthrough dest_bb
2949 //
2950 // dest_bb:
2951 // buzz;
2952
2953 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2954 Register Scav;
2955
2956 // If we've previously reserved a register for long branches,
2957 // avoid running the scavenger and just use that register.
2958 if (LongBranchReservedReg) {
2959 RS->enterBasicBlock(MBB);
2960 Scav = LongBranchReservedReg;
2961 } else {
2963 Scav = RS->scavengeRegisterBackwards(
2964 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2965 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2966 }
2967 if (Scav) {
2968 RS->setRegUsed(Scav);
2969 MRI.replaceRegWith(PCReg, Scav);
2970 MRI.clearVirtRegs();
2971 } else {
2972 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
2973 // SGPR spill.
2974 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2975 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2976 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
2977 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
2978 MRI.clearVirtRegs();
2979 }
2980
2981 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
2982 // Now the distance can be defined.
2984 MCSymbolRefExpr::create(DestLabel, MCCtx),
2985 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
2986 // Add offset assignments.
2987 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
2988 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
2989 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
2990 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
2991}
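// The net effect is a PC-relative indirect branch roughly of the form
// (the SGPR pair is whatever was scavenged or reserved above):
//   s_getpc_b64  s[N:N+1]
// post_getpc:
//   s_add_u32    sN,   sN,   (dest - post_getpc) & 0xffffffff
//   s_addc_u32   sN+1, sN+1, (dest - post_getpc) >> 32
//   s_setpc_b64  s[N:N+1]
// with the two literals materialized through the offset_lo / offset_hi
// temporary symbols defined above.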
2992
2993unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2994 switch (Cond) {
2995 case SIInstrInfo::SCC_TRUE:
2996 return AMDGPU::S_CBRANCH_SCC1;
2997 case SIInstrInfo::SCC_FALSE:
2998 return AMDGPU::S_CBRANCH_SCC0;
2999 case SIInstrInfo::VCCNZ:
3000 return AMDGPU::S_CBRANCH_VCCNZ;
3001 case SIInstrInfo::VCCZ:
3002 return AMDGPU::S_CBRANCH_VCCZ;
3003 case SIInstrInfo::EXECNZ:
3004 return AMDGPU::S_CBRANCH_EXECNZ;
3005 case SIInstrInfo::EXECZ:
3006 return AMDGPU::S_CBRANCH_EXECZ;
3007 default:
3008 llvm_unreachable("invalid branch predicate");
3009 }
3010}
3011
3012SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3013 switch (Opcode) {
3014 case AMDGPU::S_CBRANCH_SCC0:
3015 return SCC_FALSE;
3016 case AMDGPU::S_CBRANCH_SCC1:
3017 return SCC_TRUE;
3018 case AMDGPU::S_CBRANCH_VCCNZ:
3019 return VCCNZ;
3020 case AMDGPU::S_CBRANCH_VCCZ:
3021 return VCCZ;
3022 case AMDGPU::S_CBRANCH_EXECNZ:
3023 return EXECNZ;
3024 case AMDGPU::S_CBRANCH_EXECZ:
3025 return EXECZ;
3026 default:
3027 return INVALID_BR;
3028 }
3029}
3030
3034 MachineBasicBlock *&FBB,
3036 bool AllowModify) const {
3037 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3038 // Unconditional Branch
3039 TBB = I->getOperand(0).getMBB();
3040 return false;
3041 }
3042
3043 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3044 if (Pred == INVALID_BR)
3045 return true;
3046
3047 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3048 Cond.push_back(MachineOperand::CreateImm(Pred));
3049 Cond.push_back(I->getOperand(1)); // Save the branch register.
3050
3051 ++I;
3052
3053 if (I == MBB.end()) {
3054 // Conditional branch followed by fall-through.
3055 TBB = CondBB;
3056 return false;
3057 }
3058
3059 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3060 TBB = CondBB;
3061 FBB = I->getOperand(0).getMBB();
3062 return false;
3063 }
3064
3065 return true;
3066}
3067
3069 MachineBasicBlock *&FBB,
3071 bool AllowModify) const {
3073 auto E = MBB.end();
3074 if (I == E)
3075 return false;
3076
3077 // Skip over the instructions that are artificial terminators for special
3078 // exec management.
3079 while (I != E && !I->isBranch() && !I->isReturn()) {
3080 switch (I->getOpcode()) {
3081 case AMDGPU::S_MOV_B64_term:
3082 case AMDGPU::S_XOR_B64_term:
3083 case AMDGPU::S_OR_B64_term:
3084 case AMDGPU::S_ANDN2_B64_term:
3085 case AMDGPU::S_AND_B64_term:
3086 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3087 case AMDGPU::S_MOV_B32_term:
3088 case AMDGPU::S_XOR_B32_term:
3089 case AMDGPU::S_OR_B32_term:
3090 case AMDGPU::S_ANDN2_B32_term:
3091 case AMDGPU::S_AND_B32_term:
3092 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3093 break;
3094 case AMDGPU::SI_IF:
3095 case AMDGPU::SI_ELSE:
3096 case AMDGPU::SI_KILL_I1_TERMINATOR:
3097 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3098 // FIXME: It's messy that these need to be considered here at all.
3099 return true;
3100 default:
3101 llvm_unreachable("unexpected non-branch terminator inst");
3102 }
3103
3104 ++I;
3105 }
3106
3107 if (I == E)
3108 return false;
3109
3110 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3111}
3112
3114 int *BytesRemoved) const {
3115 unsigned Count = 0;
3116 unsigned RemovedSize = 0;
3118 // Skip over artificial terminators when removing instructions.
3119 if (MI.isBranch() || MI.isReturn()) {
3120 RemovedSize += getInstSizeInBytes(MI);
3121 MI.eraseFromParent();
3122 ++Count;
3123 }
3124 }
3125
3126 if (BytesRemoved)
3127 *BytesRemoved = RemovedSize;
3128
3129 return Count;
3130}
3131
3132// Copy the flags onto the implicit condition register operand.
3134 const MachineOperand &OrigCond) {
3135 CondReg.setIsUndef(OrigCond.isUndef());
3136 CondReg.setIsKill(OrigCond.isKill());
3137}
3138
3141 MachineBasicBlock *FBB,
3143 const DebugLoc &DL,
3144 int *BytesAdded) const {
3145 if (!FBB && Cond.empty()) {
3146 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3147 .addMBB(TBB);
3148 if (BytesAdded)
3149 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3150 return 1;
3151 }
3152
3153 assert(TBB && Cond[0].isImm());
3154
3155 unsigned Opcode
3156 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3157
3158 if (!FBB) {
3159 MachineInstr *CondBr =
3160 BuildMI(&MBB, DL, get(Opcode))
3161 .addMBB(TBB);
3162
3163 // Copy the flags onto the implicit condition register operand.
3164 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3165 fixImplicitOperands(*CondBr);
3166
3167 if (BytesAdded)
3168 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3169 return 1;
3170 }
3171
3172 assert(TBB && FBB);
3173
3174 MachineInstr *CondBr =
3175 BuildMI(&MBB, DL, get(Opcode))
3176 .addMBB(TBB);
3177 fixImplicitOperands(*CondBr);
3178 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3179 .addMBB(FBB);
3180
3181 MachineOperand &CondReg = CondBr->getOperand(1);
3182 CondReg.setIsUndef(Cond[1].isUndef());
3183 CondReg.setIsKill(Cond[1].isKill());
3184
3185 if (BytesAdded)
3186 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3187
3188 return 2;
3189}
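// For example, a two-way branch on SCC_TRUE emits S_CBRANCH_SCC1 to the true
// block followed by S_BRANCH to the false block, returns 2, and reports 8
// bytes added (16 on subtargets with the branch offset 0x3f hardware bug,
// which budgets extra space per branch).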
3190
3193 if (Cond.size() != 2) {
3194 return true;
3195 }
3196
3197 if (Cond[0].isImm()) {
3198 Cond[0].setImm(-Cond[0].getImm());
3199 return false;
3200 }
3201
3202 return true;
3203}
3204
3207 Register DstReg, Register TrueReg,
3208 Register FalseReg, int &CondCycles,
3209 int &TrueCycles, int &FalseCycles) const {
3210 switch (Cond[0].getImm()) {
3211 case VCCNZ:
3212 case VCCZ: {
3214 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3215 if (MRI.getRegClass(FalseReg) != RC)
3216 return false;
3217
3218 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3219 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3220
3221 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3222 return RI.hasVGPRs(RC) && NumInsts <= 6;
3223 }
3224 case SCC_TRUE:
3225 case SCC_FALSE: {
3226 // FIXME: We could insert for VGPRs if we could replace the original compare
3227 // with a vector one.
3229 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3230 if (MRI.getRegClass(FalseReg) != RC)
3231 return false;
3232
3233 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3234
3235 // Sizes that are a multiple of 8 bytes (64 bits) can use s_cselect_b64.
3236 if (NumInsts % 2 == 0)
3237 NumInsts /= 2;
3238
3239 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3240 return RI.isSGPRClass(RC);
3241 }
3242 default:
3243 return false;
3244 }
3245}
3246
3250 Register TrueReg, Register FalseReg) const {
3251 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3252 if (Pred == VCCZ || Pred == SCC_FALSE) {
3253 Pred = static_cast<BranchPredicate>(-Pred);
3254 std::swap(TrueReg, FalseReg);
3255 }
3256
3258 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3259 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3260
3261 if (DstSize == 32) {
3263 if (Pred == SCC_TRUE) {
3264 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3265 .addReg(TrueReg)
3266 .addReg(FalseReg);
3267 } else {
3268 // Instruction's operands are backwards from what is expected.
3269 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3270 .addReg(FalseReg)
3271 .addReg(TrueReg);
3272 }
3273
3274 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3275 return;
3276 }
3277
3278 if (DstSize == 64 && Pred == SCC_TRUE) {
3280 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3281 .addReg(TrueReg)
3282 .addReg(FalseReg);
3283
3284 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3285 return;
3286 }
3287
3288 static const int16_t Sub0_15[] = {
3289 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3290 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3291 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3292 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3293 };
3294
3295 static const int16_t Sub0_15_64[] = {
3296 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3297 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3298 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3299 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3300 };
3301
3302 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3303 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3304 const int16_t *SubIndices = Sub0_15;
3305 int NElts = DstSize / 32;
3306
3307 // 64-bit select is only available for SALU.
3308 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3309 if (Pred == SCC_TRUE) {
3310 if (NElts % 2) {
3311 SelOp = AMDGPU::S_CSELECT_B32;
3312 EltRC = &AMDGPU::SGPR_32RegClass;
3313 } else {
3314 SelOp = AMDGPU::S_CSELECT_B64;
3315 EltRC = &AMDGPU::SGPR_64RegClass;
3316 SubIndices = Sub0_15_64;
3317 NElts /= 2;
3318 }
3319 }
3320
3322 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3323
3324 I = MIB->getIterator();
3325
3327 for (int Idx = 0; Idx != NElts; ++Idx) {
3328 Register DstElt = MRI.createVirtualRegister(EltRC);
3329 Regs.push_back(DstElt);
3330
3331 unsigned SubIdx = SubIndices[Idx];
3332
3334 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3335 Select =
3336 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3337 .addReg(FalseReg, 0, SubIdx)
3338 .addReg(TrueReg, 0, SubIdx);
3339 } else {
3340 Select =
3341 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3342 .addReg(TrueReg, 0, SubIdx)
3343 .addReg(FalseReg, 0, SubIdx);
3344 }
3345
3346 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3348
3349 MIB.addReg(DstElt)
3350 .addImm(SubIdx);
3351 }
3352}
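// For example, selecting between two 128-bit VGPR tuples under VCCNZ emits
// four V_CNDMASK_B32_e32 instructions over sub0..sub3 and recombines the
// results with a REG_SEQUENCE, while an SCC-based select of a 64-bit SGPR
// pair is handled above with a single S_CSELECT_B64.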
3353
3355 switch (MI.getOpcode()) {
3356 case AMDGPU::V_MOV_B16_t16_e32:
3357 case AMDGPU::V_MOV_B16_t16_e64:
3358 case AMDGPU::V_MOV_B32_e32:
3359 case AMDGPU::V_MOV_B32_e64:
3360 case AMDGPU::V_MOV_B64_PSEUDO:
3361 case AMDGPU::V_MOV_B64_e32:
3362 case AMDGPU::V_MOV_B64_e64:
3363 case AMDGPU::S_MOV_B32:
3364 case AMDGPU::S_MOV_B64:
3365 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3366 case AMDGPU::COPY:
3367 case AMDGPU::WWM_COPY:
3368 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3369 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3370 case AMDGPU::V_ACCVGPR_MOV_B32:
3371 return true;
3372 default:
3373 return false;
3374 }
3375}
3376
3377static constexpr unsigned ModifierOpNames[] = {
3378 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3379 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3380 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3381
3383 unsigned Opc = MI.getOpcode();
3384 for (unsigned Name : reverse(ModifierOpNames)) {
3386 if (Idx >= 0)
3387 MI.removeOperand(Idx);
3388 }
3389}
3390
3392 Register Reg, MachineRegisterInfo *MRI) const {
3393 if (!MRI->hasOneNonDBGUse(Reg))
3394 return false;
3395
3396 switch (DefMI.getOpcode()) {
3397 default:
3398 return false;
3399 case AMDGPU::V_MOV_B64_e32:
3400 case AMDGPU::S_MOV_B64:
3401 case AMDGPU::V_MOV_B64_PSEUDO:
3402 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3403 case AMDGPU::V_MOV_B32_e32:
3404 case AMDGPU::S_MOV_B32:
3405 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3406 break;
3407 }
3408
3409 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
3410 assert(ImmOp);
3411 // FIXME: We could handle FrameIndex values here.
3412 if (!ImmOp->isImm())
3413 return false;
3414
3415 auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
3416 int64_t Imm = ImmOp->getImm();
3417 switch (UseOp.getSubReg()) {
3418 default:
3419 return Imm;
3420 case AMDGPU::sub0:
3421 return Lo_32(Imm);
3422 case AMDGPU::sub1:
3423 return Hi_32(Imm);
3424 case AMDGPU::lo16:
3425 return SignExtend64<16>(Imm);
3426 case AMDGPU::hi16:
3427 return SignExtend64<16>(Imm >> 16);
3428 case AMDGPU::sub1_lo16:
3429 return SignExtend64<16>(Imm >> 32);
3430 case AMDGPU::sub1_hi16:
3431 return SignExtend64<16>(Imm >> 48);
3432 }
3433 };
3434
3435 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3436
3437 unsigned Opc = UseMI.getOpcode();
3438 if (Opc == AMDGPU::COPY) {
3439 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3440
3441 Register DstReg = UseMI.getOperand(0).getReg();
3442 unsigned OpSize = getOpSize(UseMI, 0);
3443 bool Is16Bit = OpSize == 2;
3444 bool Is64Bit = OpSize == 8;
3445 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3446 unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3447 : AMDGPU::V_MOV_B32_e32
3448 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3449 : AMDGPU::S_MOV_B32;
3450 APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)),
3451 /*isSigned=*/true, /*implicitTrunc=*/true);
3452
3453 if (RI.isAGPR(*MRI, DstReg)) {
3454 if (Is64Bit || !isInlineConstant(Imm))
3455 return false;
3456 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3457 }
3458
3459 if (Is16Bit) {
3460 if (isVGPRCopy)
3461 return false; // Do not clobber vgpr_hi16
3462
3463 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3464 return false;
3465
3466 UseMI.getOperand(0).setSubReg(0);
3467 if (DstReg.isPhysical()) {
3468 DstReg = RI.get32BitRegister(DstReg);
3469 UseMI.getOperand(0).setReg(DstReg);
3470 }
3471 assert(UseMI.getOperand(1).getReg().isVirtual());
3472 }
3473
3474 const MCInstrDesc &NewMCID = get(NewOpc);
3475 if (DstReg.isPhysical() &&
3476 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3477 return false;
3478
3479 UseMI.setDesc(NewMCID);
3480 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3481 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3482 return true;
3483 }
3484
3485 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3486 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3487 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3488 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3489 Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
3490 // Don't fold if we are using source or output modifiers. The new VOP2
3491 // instructions don't have them.
3493 return false;
3494
3495 // If this is a free constant, there's no reason to do this.
3496 // TODO: We could fold this here instead of letting SIFoldOperands do it
3497 // later.
3498 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
3499
3500 // Any src operand can be used for the legality check.
3501 if (isInlineConstant(UseMI, *Src0, *ImmOp))
3502 return false;
3503
3504 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3505 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
3506 bool IsFMA =
3507 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3508 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3509 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
3510 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3511 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3512
3513 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3514 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3515 (Src1->isReg() && Src1->getReg() == Reg)) {
3516 MachineOperand *RegSrc =
3517 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3518 if (!RegSrc->isReg())
3519 return false;
3520 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3521 ST.getConstantBusLimit(Opc) < 2)
3522 return false;
3523
3524 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3525 return false;
3526
3527 // If src2 is also a literal constant then we have to choose which one to
3528 // fold. In general it is better to choose madak so that the other literal
3529 // can be materialized in an sgpr instead of a vgpr:
3530 // s_mov_b32 s0, literal
3531 // v_madak_f32 v0, s0, v0, literal
3532 // Instead of:
3533 // v_mov_b32 v1, literal
3534 // v_madmk_f32 v0, v0, literal, v1
3535 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3536 if (Def && Def->isMoveImmediate() &&
3537 !isInlineConstant(Def->getOperand(1)))
3538 return false;
3539
3540 unsigned NewOpc =
3541 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3542 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
3543 : AMDGPU::V_FMAMK_F16)
3544 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3545 if (pseudoToMCOpcode(NewOpc) == -1)
3546 return false;
3547
3548 // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3549 // would also require restricting their register classes. For now
3550 // just bail out.
3551 if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3552 return false;
3553
3554 const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
3555
3556 // FIXME: This would be a lot easier if we could return a new instruction
3557 // instead of having to modify in place.
3558
3559 Register SrcReg = RegSrc->getReg();
3560 unsigned SrcSubReg = RegSrc->getSubReg();
3561 Src0->setReg(SrcReg);
3562 Src0->setSubReg(SrcSubReg);
3563 Src0->setIsKill(RegSrc->isKill());
3564
3565 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3566 Opc == AMDGPU::V_FMAC_F32_e64 ||
3567 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
3568 UseMI.untieRegOperand(
3569 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3570
3571 Src1->ChangeToImmediate(Imm);
3572
3573 removeModOperands(UseMI);
3574 UseMI.setDesc(get(NewOpc));
3575
3576 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3577 if (DeleteDef)
3578 DefMI.eraseFromParent();
3579
3580 return true;
3581 }
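// Illustrative sketch of the fold above (registers and the literal value are
// hypothetical, not taken from this file): given
//   v_mov_b32 v1, 0x42f60000
//   v_mac_f32 v2, v1, v3        ; v2 += v1 * v3
// folding the literal into the multiplied operand yields
//   v_madmk_f32 v2, v3, 0x42f60000, v2
// and the defining v_mov_b32 is erased if its register has no remaining
// non-debug uses.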
3582
3583 // Added part is the constant: Use v_madak_{f16, f32}.
3584 if (Src2->isReg() && Src2->getReg() == Reg) {
3585 if (ST.getConstantBusLimit(Opc) < 2) {
3586 // Not allowed to use constant bus for another operand.
3587 // We can however allow an inline immediate as src0.
3588 bool Src0Inlined = false;
3589 if (Src0->isReg()) {
3590 // Try to inline constant if possible.
3591 // If the Def is a move-immediate with a single use,
3592 // we are saving a VGPR here.
3593 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3594 if (Def && Def->isMoveImmediate() &&
3595 isInlineConstant(Def->getOperand(1)) &&
3596 MRI->hasOneUse(Src0->getReg())) {
3597 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3598 Src0Inlined = true;
3599 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3600 RI.isSGPRReg(*MRI, Src0->getReg())) {
3601 return false;
3602 }
3603 // VGPR is okay as Src0 - fallthrough
3604 }
3605
3606 if (Src1->isReg() && !Src0Inlined) {
3607 // We have one slot for an inlinable constant so far - try to fill it
3608 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3609 if (Def && Def->isMoveImmediate() &&
3610 isInlineConstant(Def->getOperand(1)) &&
3611 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3612 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3613 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3614 return false;
3615 // VGPR is okay as Src1 - fallthrough
3616 }
3617 }
3618
3619 unsigned NewOpc =
3620 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3621 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
3622 : AMDGPU::V_FMAAK_F16)
3623 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3624 if (pseudoToMCOpcode(NewOpc) == -1)
3625 return false;
3626
3627 // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3628 // would also require restricting their register classes. For now
3629 // just bail out.
3630 if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3631 return false;
3632
3633 // FIXME: This would be a lot easier if we could return a new instruction
3634 // instead of having to modify in place.
3635
3636 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3637 Opc == AMDGPU::V_FMAC_F32_e64 ||
3638 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
3639 UseMI.untieRegOperand(
3640 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3641
3642 // Changing Src2 to an immediate adds it back to the instruction.
3643 Src2->ChangeToImmediate(getImmFor(*Src2));
3644
3645 // These come before src2.
3646 removeModOperands(UseMI);
3647 UseMI.setDesc(get(NewOpc));
3648 // It might happen that UseMI was commuted and we now have an SGPR as
3649 // src1. If so, two inline constants plus an SGPR would be illegal, so
3650 // the operands must be legalized.
3651 legalizeOperands(UseMI);
3652
3653 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3654 if (DeleteDef)
3655 DefMI.eraseFromParent();
3656
3657 return true;
3658 }
3659 }
3660
3661 return false;
3662}
3663
3664static bool
3665memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3666 ArrayRef<const MachineOperand *> BaseOps2) {
3667 if (BaseOps1.size() != BaseOps2.size())
3668 return false;
3669 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3670 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3671 return false;
3672 }
3673 return true;
3674}
3675
3676static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3677 LocationSize WidthB, int OffsetB) {
3678 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3679 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3680 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3681 return LowWidth.hasValue() &&
3682 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3683}
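// Worked example for offsetsDoNotOverlap (values are illustrative): with
// OffsetA = 0, WidthA = 4 and OffsetB = 4, WidthB = 8, the lower access ends
// at 0 + 4 = 4 <= 4, so the ranges are disjoint and the function returns
// true. With OffsetB = 2 instead, 0 + 4 = 4 > 2, so it returns false.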
3684
3685bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3686 const MachineInstr &MIb) const {
3687 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3688 int64_t Offset0, Offset1;
3689 LocationSize Dummy0 = 0, Dummy1 = 0;
3690 bool Offset0IsScalable, Offset1IsScalable;
3691 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3692 Dummy0, &RI) ||
3693 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3694 Dummy1, &RI))
3695 return false;
3696
3697 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3698 return false;
3699
3700 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3701 // FIXME: Handle ds_read2 / ds_write2.
3702 return false;
3703 }
3704 LocationSize Width0 = MIa.memoperands().front()->getSize();
3705 LocationSize Width1 = MIb.memoperands().front()->getSize();
3706 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3707}
3708
3709bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3710 const MachineInstr &MIb) const {
3711 assert(MIa.mayLoadOrStore() &&
3712 "MIa must load from or modify a memory location");
3713 assert(MIb.mayLoadOrStore() &&
3714 "MIb must load from or modify a memory location");
3715
3716 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3717 return false;
3718
3719 // XXX - Can we relax this between address spaces?
3720 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3721 return false;
3722
3723 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3724 return false;
3725
3726 // TODO: Should we check the address space from the MachineMemOperand? That
3727 // would allow us to distinguish objects we know don't alias based on the
3728 // underlying address space, even if it was lowered to a different one,
3729 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3730 // buffer.
3731 if (isDS(MIa)) {
3732 if (isDS(MIb))
3733 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3734
3735 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3736 }
3737
3738 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3739 if (isMUBUF(MIb) || isMTBUF(MIb))
3740 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3741
3742 if (isFLAT(MIb))
3743 return isFLATScratch(MIb);
3744
3745 return !isSMRD(MIb);
3746 }
3747
3748 if (isSMRD(MIa)) {
3749 if (isSMRD(MIb))
3750 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3751
3752 if (isFLAT(MIb))
3753 return isFLATScratch(MIb);
3754
3755 return !isMUBUF(MIb) && !isMTBUF(MIb);
3756 }
3757
3758 if (isFLAT(MIa)) {
3759 if (isFLAT(MIb)) {
3760 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3761 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3762 return true;
3763
3764 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3765 }
3766
3767 return false;
3768 }
3769
3770 return false;
3771}
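// Informal example of the case analysis above: a DS (LDS) access such as
// ds_write_b32 and a global access such as global_load_dword address
// different memory, so the pair is reported as trivially disjoint, while two
// DS accesses fall through to the base-operand and offset overlap check.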
3772
3773static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3774 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3775 if (Reg.isPhysical())
3776 return false;
3777 auto *Def = MRI.getUniqueVRegDef(Reg);
3778 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3779 Imm = Def->getOperand(1).getImm();
3780 if (DefMI)
3781 *DefMI = Def;
3782 return true;
3783 }
3784 return false;
3785}
3786
3787static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3788 MachineInstr **DefMI = nullptr) {
3789 if (!MO->isReg())
3790 return false;
3791 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3792 const MachineRegisterInfo &MRI = MF->getRegInfo();
3793 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3794}
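// Example of what the two getFoldableImm helpers recognize (hypothetical
// MIR): if the operand's register is defined by a foldable copy such as
//   %5:sreg_32 = S_MOV_B32 42
// they return true with Imm = 42 and, when requested, hand the defining
// instruction back through *DefMI so the caller can erase it after folding.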
3795
3796static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3797 MachineInstr &NewMI) {
3798 if (LV) {
3799 unsigned NumOps = MI.getNumOperands();
3800 for (unsigned I = 1; I < NumOps; ++I) {
3801 MachineOperand &Op = MI.getOperand(I);
3802 if (Op.isReg() && Op.isKill())
3803 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3804 }
3805 }
3806}
3807
3808static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
3809 switch (Opc) {
3810 case AMDGPU::V_MAC_F16_e32:
3811 case AMDGPU::V_MAC_F16_e64:
3812 return AMDGPU::V_MAD_F16_e64;
3813 case AMDGPU::V_MAC_F32_e32:
3814 case AMDGPU::V_MAC_F32_e64:
3815 return AMDGPU::V_MAD_F32_e64;
3816 case AMDGPU::V_MAC_LEGACY_F32_e32:
3817 case AMDGPU::V_MAC_LEGACY_F32_e64:
3818 return AMDGPU::V_MAD_LEGACY_F32_e64;
3819 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3820 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3821 return AMDGPU::V_FMA_LEGACY_F32_e64;
3822 case AMDGPU::V_FMAC_F16_e32:
3823 case AMDGPU::V_FMAC_F16_e64:
3824 case AMDGPU::V_FMAC_F16_fake16_e64:
3825 return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
3826 : AMDGPU::V_FMA_F16_gfx9_e64;
3827 case AMDGPU::V_FMAC_F32_e32:
3828 case AMDGPU::V_FMAC_F32_e64:
3829 return AMDGPU::V_FMA_F32_e64;
3830 case AMDGPU::V_FMAC_F64_e32:
3831 case AMDGPU::V_FMAC_F64_e64:
3832 return AMDGPU::V_FMA_F64_e64;
3833 default:
3834 llvm_unreachable("invalid instruction");
3835 }
3836}
3837
3838MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3839 LiveVariables *LV,
3840 LiveIntervals *LIS) const {
3841 MachineBasicBlock &MBB = *MI.getParent();
3842 unsigned Opc = MI.getOpcode();
3843
3844 // Handle MFMA.
3845 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3846 if (NewMFMAOpc != -1) {
3847 MachineInstrBuilder MIB =
3848 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3849 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3850 MIB.add(MI.getOperand(I));
3851 updateLiveVariables(LV, MI, *MIB);
3852 if (LIS) {
3853 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3854 // SlotIndex of defs needs to be updated when converting to early-clobber
3855 MachineOperand &Def = MIB->getOperand(0);
3856 if (Def.isEarlyClobber() && Def.isReg() &&
3857 LIS->hasInterval(Def.getReg())) {
3858 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
3859 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
3860 auto &LI = LIS->getInterval(Def.getReg());
3861 auto UpdateDefIndex = [&](LiveRange &LR) {
3862 auto *S = LR.find(OldIndex);
3863 if (S != LR.end() && S->start == OldIndex) {
3864 assert(S->valno && S->valno->def == OldIndex);
3865 S->start = NewIndex;
3866 S->valno->def = NewIndex;
3867 }
3868 };
3869 UpdateDefIndex(LI);
3870 for (auto &SR : LI.subranges())
3871 UpdateDefIndex(SR);
3872 }
3873 }
3874 return MIB;
3875 }
3876
3877 if (SIInstrInfo::isWMMA(MI)) {
3878 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
3879 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3880 .setMIFlags(MI.getFlags());
3881 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3882 MIB->addOperand(MI.getOperand(I));
3883
3884 updateLiveVariables(LV, MI, *MIB);
3885 if (LIS)
3886 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3887
3888 return MIB;
3889 }
3890
3891 assert(
3892 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3893 "V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
3894 "pre-RA");
3895
3896 // Handle MAC/FMAC.
3897 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3898 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3899 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
3900 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3901 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3902 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3903 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3904 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3905 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3906 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3907 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3908 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3909 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3910 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3911 bool Src0Literal = false;
3912
3913 switch (Opc) {
3914 default:
3915 return nullptr;
3916 case AMDGPU::V_MAC_F16_e64:
3917 case AMDGPU::V_FMAC_F16_e64:
3918 case AMDGPU::V_FMAC_F16_fake16_e64:
3919 case AMDGPU::V_MAC_F32_e64:
3920 case AMDGPU::V_MAC_LEGACY_F32_e64:
3921 case AMDGPU::V_FMAC_F32_e64:
3922 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3923 case AMDGPU::V_FMAC_F64_e64:
3924 break;
3925 case AMDGPU::V_MAC_F16_e32:
3926 case AMDGPU::V_FMAC_F16_e32:
3927 case AMDGPU::V_MAC_F32_e32:
3928 case AMDGPU::V_MAC_LEGACY_F32_e32:
3929 case AMDGPU::V_FMAC_F32_e32:
3930 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3931 case AMDGPU::V_FMAC_F64_e32: {
3932 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3933 AMDGPU::OpName::src0);
3934 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3935 if (!Src0->isReg() && !Src0->isImm())
3936 return nullptr;
3937
3938 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3939 Src0Literal = true;
3940
3941 break;
3942 }
3943 }
3944
3945 MachineInstrBuilder MIB;
3946 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3947 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3948 const MachineOperand *Src0Mods =
3949 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3950 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3951 const MachineOperand *Src1Mods =
3952 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3953 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3954 const MachineOperand *Src2Mods =
3955 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
3956 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3957 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3958 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
3959
3960 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
3961 !IsLegacy &&
3962 // If we have an SGPR input, we will violate the constant bus restriction.
3963 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3964 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3965 MachineInstr *DefMI;
3966 const auto killDef = [&]() -> void {
3967 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3968 // The only user is the instruction which will be killed.
3969 Register DefReg = DefMI->getOperand(0).getReg();
3970
3971 if (MRI.hasOneNonDBGUse(DefReg)) {
3972 // We cannot just remove the DefMI here, calling pass will crash.
3973 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
3974 DefMI->getOperand(0).setIsDead(true);
3975 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
3976 DefMI->removeOperand(I);
3977 if (LV)
3978 LV->getVarInfo(DefReg).AliveBlocks.clear();
3979 }
3980
3981 if (LIS) {
3982 LiveInterval &DefLI = LIS->getInterval(DefReg);
3983
3984 // We cannot delete the original instruction here, so hack out the use
3985 // in the original instruction with a dummy register so we can use
3986 // shrinkToUses to deal with any multi-use edge cases. Other targets do
3987 // not have the complexity of deleting a use to consider here.
3988 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
3989 for (MachineOperand &MIOp : MI.uses()) {
3990 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
3991 MIOp.setIsUndef(true);
3992 MIOp.setReg(DummyReg);
3993 }
3994 }
3995
3996 LIS->shrinkToUses(&DefLI);
3997 }
3998 };
3999
4000 int64_t Imm;
4001 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4002 unsigned NewOpc =
4003 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
4004 : AMDGPU::V_FMAAK_F16)
4005 : AMDGPU::V_FMAAK_F32)
4006 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
4007 if (pseudoToMCOpcode(NewOpc) != -1) {
4008 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4009 .add(*Dst)
4010 .add(*Src0)
4011 .add(*Src1)
4012 .addImm(Imm)
4013 .setMIFlags(MI.getFlags());
4014 updateLiveVariables(LV, MI, *MIB);
4015 if (LIS)
4016 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4017 killDef();
4018 return MIB;
4019 }
4020 }
4021 unsigned NewOpc =
4022 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
4023 : AMDGPU::V_FMAMK_F16)
4024 : AMDGPU::V_FMAMK_F32)
4025 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
4026 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4027 if (pseudoToMCOpcode(NewOpc) != -1) {
4028 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4029 .add(*Dst)
4030 .add(*Src0)
4031 .addImm(Imm)
4032 .add(*Src2)
4033 .setMIFlags(MI.getFlags());
4034 updateLiveVariables(LV, MI, *MIB);
4035
4036 if (LIS)
4037 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4038 killDef();
4039 return MIB;
4040 }
4041 }
4042 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4043 if (Src0Literal) {
4044 Imm = Src0->getImm();
4045 DefMI = nullptr;
4046 }
4047 if (pseudoToMCOpcode(NewOpc) != -1 &&
4048 isOperandLegal(
4049 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4050 Src1)) {
4051 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4052 .add(*Dst)
4053 .add(*Src1)
4054 .addImm(Imm)
4055 .add(*Src2)
4056 .setMIFlags(MI.getFlags());
4057 updateLiveVariables(LV, MI, *MIB);
4058
4059 if (LIS)
4060 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4061 if (DefMI)
4062 killDef();
4063 return MIB;
4064 }
4065 }
4066 }
4067
4068 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4069 // if VOP3 does not allow a literal operand.
4070 if (Src0Literal && !ST.hasVOP3Literal())
4071 return nullptr;
4072
4073 unsigned NewOpc = getNewFMAInst(ST, Opc);
4074
4075 if (pseudoToMCOpcode(NewOpc) == -1)
4076 return nullptr;
4077
4078 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4079 .add(*Dst)
4080 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4081 .add(*Src0)
4082 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4083 .add(*Src1)
4084 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4085 .add(*Src2)
4086 .addImm(Clamp ? Clamp->getImm() : 0)
4087 .addImm(Omod ? Omod->getImm() : 0)
4088 .setMIFlags(MI.getFlags());
4089 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4090 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4091 updateLiveVariables(LV, MI, *MIB);
4092 if (LIS)
4093 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4094 return MIB;
4095}
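// Illustrative before/after for the MAC/FMAC handling above (operands are
// hypothetical): the two-address form
//   v_mac_f32 v0, v1, v2          ; v0 = v1 * v2 + v0, v0 tied to the def
// is rebuilt as the three-address
//   v_mad_f32 v0, v1, v2, v0
// (or v_madak/v_madmk when one source is a foldable literal), which removes
// the tied-operand constraint for the register allocator.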
4096
4097// It's not generally safe to move VALU instructions across these since it will
4098// start using the register as a base index rather than directly.
4099// XXX - Why isn't hasSideEffects sufficient for these?
4100static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4101 switch (MI.getOpcode()) {
4102 case AMDGPU::S_SET_GPR_IDX_ON:
4103 case AMDGPU::S_SET_GPR_IDX_MODE:
4104 case AMDGPU::S_SET_GPR_IDX_OFF:
4105 return true;
4106 default:
4107 return false;
4108 }
4109}
4110
4111bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4112 const MachineBasicBlock *MBB,
4113 const MachineFunction &MF) const {
4114 // Skipping the check for SP writes in the base implementation. It was
4115 // apparently added there due to compile-time concerns.
4116 //
4117 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4118 // but is probably avoidable.
4119
4120 // Copied from base implementation.
4121 // Terminators and labels can't be scheduled around.
4122 if (MI.isTerminator() || MI.isPosition())
4123 return true;
4124
4125 // INLINEASM_BR can jump to another block
4126 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4127 return true;
4128
4129 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4130 return true;
4131
4132 // Target-independent instructions do not have an implicit-use of EXEC, even
4133 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4134 // boundaries prevents incorrect movements of such instructions.
4135 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4136 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4137 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4138 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4139 changesVGPRIndexingMode(MI);
4140}
4141
4142bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4143 return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
4144}
4145
4146bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4147 // Skip the full operand and register alias search modifiesRegister
4148 // does. There's only a handful of instructions that touch this, it's only an
4149 // implicit def, and doesn't alias any other registers.
4150 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4151}
4152
4153bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4154 unsigned Opcode = MI.getOpcode();
4155
4156 if (MI.mayStore() && isSMRD(MI))
4157 return true; // scalar store or atomic
4158
4159 // This will terminate the function when other lanes may need to continue.
4160 if (MI.isReturn())
4161 return true;
4162
4163 // These instructions cause shader I/O that may cause hardware lockups
4164 // when executed with an empty EXEC mask.
4165 //
4166 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4167 // EXEC = 0, but checking for that case here seems not worth it
4168 // given the typical code patterns.
4169 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4170 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4171 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4172 return true;
4173
4174 if (MI.isCall() || MI.isInlineAsm())
4175 return true; // conservative assumption
4176
4177 // Assume that barrier interactions are only intended with active lanes.
4178 if (isBarrier(Opcode))
4179 return true;
4180
4181 // A mode change is a scalar operation that influences vector instructions.
4182 if (modifiesModeRegister(MI))
4183 return true;
4184
4185 // These are like SALU instructions in terms of effects, so it's questionable
4186 // whether we should return true for those.
4187 //
4188 // However, executing them with EXEC = 0 causes them to operate on undefined
4189 // data, which we avoid by returning true here.
4190 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4191 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4192 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4193 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4194 return true;
4195
4196 return false;
4197}
4198
4199bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4200 const MachineInstr &MI) const {
4201 if (MI.isMetaInstruction())
4202 return false;
4203
4204 // This won't read exec if this is an SGPR->SGPR copy.
4205 if (MI.isCopyLike()) {
4206 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4207 return true;
4208
4209 // Make sure this isn't copying exec as a normal operand
4210 return MI.readsRegister(AMDGPU::EXEC, &RI);
4211 }
4212
4213 // Make a conservative assumption about the callee.
4214 if (MI.isCall())
4215 return true;
4216
4217 // Be conservative with any unhandled generic opcodes.
4218 if (!isTargetSpecificOpcode(MI.getOpcode()))
4219 return true;
4220
4221 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4222}
4223
4224bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4225 switch (Imm.getBitWidth()) {
4226 case 1: // This likely will be a condition code mask.
4227 return true;
4228
4229 case 32:
4230 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4231 ST.hasInv2PiInlineImm());
4232 case 64:
4233 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4234 ST.hasInv2PiInlineImm());
4235 case 16:
4236 return ST.has16BitInsts() &&
4237 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4238 ST.hasInv2PiInlineImm());
4239 default:
4240 llvm_unreachable("invalid bitwidth");
4241 }
4242}
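// For reference (informal summary, see AMDGPU::isInlinableLiteral32/64): the
// inline constants are the small integers -16..64 plus a handful of
// floating-point values such as +/-0.5, +/-1.0, +/-2.0, +/-4.0, and 1/(2*pi)
// when ST.hasInv2PiInlineImm() is true. Anything else must be emitted as a
// literal and, for VALU instructions, counts against the constant bus limit.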
4243
4244bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4245 APInt IntImm = Imm.bitcastToAPInt();
4246 int64_t IntImmVal = IntImm.getSExtValue();
4247 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4248 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4249 default:
4250 llvm_unreachable("invalid fltSemantics");
4253 return isInlineConstant(IntImm);
4255 return ST.has16BitInsts() &&
4256 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4258 return ST.has16BitInsts() &&
4259 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4260 }
4261}
4262
4263bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
4264 uint8_t OperandType) const {
4265 assert(!MO.isReg() && "isInlineConstant called on register operand!");
4266 if (!MO.isImm())
4267 return false;
4268
4269 // MachineOperand provides no way to tell the true operand size, since it only
4270 // records a 64-bit value. We need to know the size to determine if a 32-bit
4271 // floating point immediate bit pattern is legal for an integer immediate. It
4272 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4273
4274 int64_t Imm = MO.getImm();
4275 switch (OperandType) {
4288 int32_t Trunc = static_cast<int32_t>(Imm);
4290 }
4297 ST.hasInv2PiInlineImm());
4301 // We would expect inline immediates to not be concerned with an integer/fp
4302 // distinction. However, in the case of 16-bit integer operations, the
4303 // "floating point" values appear not to work. It seems to read the low 16 bits
4304 // of 32-bit immediates, which happens to always work for the integer
4305 // values.
4306 //
4307 // See llvm bugzilla 46302.
4308 //
4309 // TODO: Theoretically we could use op-sel to use the high bits of the
4310 // 32-bit FP values.
4328 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4329 // A few special case instructions have 16-bit operands on subtargets
4330 // where 16-bit instructions are not legal.
4331 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4332 // constants in these cases
4333 int16_t Trunc = static_cast<int16_t>(Imm);
4334 return ST.has16BitInsts() &&
4336 }
4337
4338 return false;
4339 }
4344 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4345 int16_t Trunc = static_cast<int16_t>(Imm);
4346 return ST.has16BitInsts() &&
4348 }
4349 return false;
4350 }
4353 return false;
4356 // Always embedded in the instruction for free.
4357 return true;
4367 // Just ignore anything else.
4368 return true;
4369 default:
4370 llvm_unreachable("invalid operand type");
4371 }
4372}
4373
4374static bool compareMachineOp(const MachineOperand &Op0,
4375 const MachineOperand &Op1) {
4376 if (Op0.getType() != Op1.getType())
4377 return false;
4378
4379 switch (Op0.getType()) {
4381 return Op0.getReg() == Op1.getReg();
4383 return Op0.getImm() == Op1.getImm();
4384 default:
4385 llvm_unreachable("Didn't expect to be comparing these operand types");
4386 }
4387}
4388
4389bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4390 const MachineOperand &MO) const {
4391 const MCInstrDesc &InstDesc = MI.getDesc();
4392 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4393
4394 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4395
4397 return true;
4398
4399 if (OpInfo.RegClass < 0)
4400 return false;
4401
4402 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4403 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4404 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4405 AMDGPU::OpName::src2))
4406 return false;
4407 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4408 }
4409
4410 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4411 return false;
4412
4413 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4414 return true;
4415
4416 return ST.hasVOP3Literal();
4417}
4418
4419bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4420 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4421 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4422 return false;
4423
4424 int Op32 = AMDGPU::getVOPe32(Opcode);
4425 if (Op32 == -1)
4426 return false;
4427
4428 return pseudoToMCOpcode(Op32) != -1;
4429}
4430
4431bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4432 // The src0_modifier operand is present on all instructions
4433 // that have modifiers.
4434
4435 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4436}
4437
4438bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4439 unsigned OpName) const {
4440 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4441 return Mods && Mods->getImm();
4442}
4443
4444bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4445 return any_of(ModifierOpNames,
4446 [&](unsigned Name) { return hasModifiersSet(MI, Name); });
4447}
4448
4449bool SIInstrInfo::canShrink(const MachineInstr &MI,
4450 const MachineRegisterInfo &MRI) const {
4451 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4452 // Can't shrink instruction with three operands.
4453 if (Src2) {
4454 switch (MI.getOpcode()) {
4455 default: return false;
4456
4457 case AMDGPU::V_ADDC_U32_e64:
4458 case AMDGPU::V_SUBB_U32_e64:
4459 case AMDGPU::V_SUBBREV_U32_e64: {
4460 const MachineOperand *Src1
4461 = getNamedOperand(MI, AMDGPU::OpName::src1);
4462 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4463 return false;
4464 // Additional verification is needed for sdst/src2.
4465 return true;
4466 }
4467 case AMDGPU::V_MAC_F16_e64:
4468 case AMDGPU::V_MAC_F32_e64:
4469 case AMDGPU::V_MAC_LEGACY_F32_e64:
4470 case AMDGPU::V_FMAC_F16_e64:
4471 case AMDGPU::V_FMAC_F16_fake16_e64:
4472 case AMDGPU::V_FMAC_F32_e64:
4473 case AMDGPU::V_FMAC_F64_e64:
4474 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4475 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4476 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4477 return false;
4478 break;
4479
4480 case AMDGPU::V_CNDMASK_B32_e64:
4481 break;
4482 }
4483 }
4484
4485 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4486 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4487 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4488 return false;
4489
4490 // We don't need to check src0, all input types are legal, so just make sure
4491 // src0 isn't using any modifiers.
4492 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4493 return false;
4494
4495 // Can it be shrunk to a valid 32 bit opcode?
4496 if (!hasVALU32BitEncoding(MI.getOpcode()))
4497 return false;
4498
4499 // Check output modifiers
4500 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4501 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4502 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4503 // TODO: Can we avoid checking bound_ctrl/fi here?
4504 // They are only used by permlane*_swap special case.
4505 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4506 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4507}
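// Informal examples of the shrinking rules above (hypothetical operands):
//   v_add_f32_e64 v0, v1, v2           -> shrinkable to v_add_f32_e32
//   v_add_f32_e64 v0, v1, s2           -> not shrinkable (src1 must be a VGPR)
//   v_add_f32_e64 v0, |v1|, v2         -> not shrinkable (source modifier)
//   v_add_f32_e64 v0, v1, v2 clamp     -> not shrinkable (output modifier)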
4508
4509// Set VCC operand with all flags from \p Orig, except for setting it as
4510// implicit.
4511static void copyFlagsToImplicitVCC(MachineInstr &MI,
4512 const MachineOperand &Orig) {
4513
4514 for (MachineOperand &Use : MI.implicit_operands()) {
4515 if (Use.isUse() &&
4516 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4517 Use.setIsUndef(Orig.isUndef());
4518 Use.setIsKill(Orig.isKill());
4519 return;
4520 }
4521 }
4522}
4523
4524MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4525 unsigned Op32) const {
4526 MachineBasicBlock *MBB = MI.getParent();
4527
4528 const MCInstrDesc &Op32Desc = get(Op32);
4529 MachineInstrBuilder Inst32 =
4530 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4531 .setMIFlags(MI.getFlags());
4532
4533 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4534 // For VOPC instructions, this is replaced by an implicit def of vcc.
4535
4536 // We assume the defs of the shrunk opcode are in the same order, and the
4537 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4538 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4539 Inst32.add(MI.getOperand(I));
4540
4541 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4542
4543 int Idx = MI.getNumExplicitDefs();
4544 for (const MachineOperand &Use : MI.explicit_uses()) {
4545 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4547 continue;
4548
4549 if (&Use == Src2) {
4550 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4551 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4552 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4553 // of vcc was already added during the initial BuildMI, but we
4554 // 1) may need to change vcc to vcc_lo to preserve the original register
4555 // 2) have to preserve the original flags.
4556 copyFlagsToImplicitVCC(*Inst32, *Src2);
4557 continue;
4558 }
4559 }
4560
4561 Inst32.add(Use);
4562 }
4563
4564 // FIXME: Losing implicit operands
4565 fixImplicitOperands(*Inst32);
4566 return Inst32;
4567}
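// Hypothetical example of the src2 special case handled above:
//   v_cndmask_b32_e64 v0, v1, v2, vcc
// shrinks to
//   v_cndmask_b32_e32 v0, v1, v2
// where the explicit vcc use becomes the implicit vcc (or vcc_lo in wave32)
// read of the e32 encoding, with its undef/kill flags carried over by
// copyFlagsToImplicitVCC().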
4568
4569bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4570 const MachineOperand &MO,
4571 const MCOperandInfo &OpInfo) const {
4572 // Literal constants use the constant bus.
4573 if (!MO.isReg())
4574 return !isInlineConstant(MO, OpInfo);
4575
4576 if (!MO.isUse())
4577 return false;
4578
4579 if (MO.getReg().isVirtual())
4580 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4581
4582 // Null is free
4583 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4584 return false;
4585
4586 // SGPRs use the constant bus
4587 if (MO.isImplicit()) {
4588 return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
4589 MO.getReg() == AMDGPU::VCC_LO;
4590 }
4591 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4592 AMDGPU::SReg_64RegClass.contains(MO.getReg());
4593}
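// Informal illustration of the rule this helper feeds (hypothetical code):
// with a constant bus limit of 1,
//   v_add_f32 v0, s0, s1
// is illegal because two different SGPRs would need the bus in a single
// instruction, while
//   v_add_f32 v0, s0, v1
// is fine. Inline constants and null do not use the bus; literals and SGPRs
// (including m0 and vcc) do.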
4594
4595static Register findImplicitSGPRRead(const MachineInstr &MI) {
4596 for (const MachineOperand &MO : MI.implicit_operands()) {
4597 // We only care about reads.
4598 if (MO.isDef())
4599 continue;
4600
4601 switch (MO.getReg()) {
4602 case AMDGPU::VCC:
4603 case AMDGPU::VCC_LO:
4604 case AMDGPU::VCC_HI:
4605 case AMDGPU::M0:
4606 case AMDGPU::FLAT_SCR:
4607 return MO.getReg();
4608
4609 default:
4610 break;
4611 }
4612 }
4613
4614 return Register();
4615}
4616
4617static bool shouldReadExec(const MachineInstr &MI) {
4618 if (SIInstrInfo::isVALU(MI)) {
4619 switch (MI.getOpcode()) {
4620 case AMDGPU::V_READLANE_B32:
4621 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4622 case AMDGPU::V_WRITELANE_B32:
4623 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4624 return false;
4625 }
4626
4627 return true;
4628 }
4629
4630 if (MI.isPreISelOpcode() ||
4631 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4632 SIInstrInfo::isSALU(MI) ||
4633 SIInstrInfo::isSMRD(MI))
4634 return false;
4635
4636 return true;
4637}
4638
4639static bool isRegOrFI(const MachineOperand &MO) {
4640 return MO.isReg() || MO.isFI();
4641}
4642
4643static bool isSubRegOf(const SIRegisterInfo &TRI,
4644 const MachineOperand &SuperVec,
4645 const MachineOperand &SubReg) {
4646 if (SubReg.getReg().isPhysical())
4647 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4648
4649 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4650 SubReg.getReg() == SuperVec.getReg();
4651}
4652
4653// Verify the illegal copy from vector register to SGPR for generic opcode COPY
4654bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4655 const MachineRegisterInfo &MRI,
4656 StringRef &ErrInfo) const {
4657 Register DstReg = MI.getOperand(0).getReg();
4658 Register SrcReg = MI.getOperand(1).getReg();
4659 // This is a check for copy from vector register to SGPR
4660 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4661 ErrInfo = "illegal copy from vector register to SGPR";
4662 return false;
4663 }
4664 return true;
4665}
4666
4667bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4668 StringRef &ErrInfo) const {
4669 uint16_t Opcode = MI.getOpcode();
4670 const MachineFunction *MF = MI.getParent()->getParent();
4671 const MachineRegisterInfo &MRI = MF->getRegInfo();
4672
4673 // FIXME: At this point the COPY verify is done only for non-ssa forms.
4674 // Find a better property to recognize the point where instruction selection
4675 // is just done.
4676 // We can only enforce this check after SIFixSGPRCopies pass so that the
4677 // illegal copies are legalized and thereafter we don't expect a pass
4678 // inserting similar copies.
4679 if (!MRI.isSSA() && MI.isCopy())
4680 return verifyCopy(MI, MRI, ErrInfo);
4681
4682 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
4683 return true;
4684
4685 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4686 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4687 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4688 int Src3Idx = -1;
4689 if (Src0Idx == -1) {
4690 // VOPD V_DUAL_* instructions use different operand names.
4691 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4692 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4693 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4694 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4695 }
4696
4697 // Make sure the number of operands is correct.
4698 const MCInstrDesc &Desc = get(Opcode);
4699 if (!Desc.isVariadic() &&
4700 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4701 ErrInfo = "Instruction has wrong number of operands.";
4702 return false;
4703 }
4704
4705 if (MI.isInlineAsm()) {
4706 // Verify register classes for inlineasm constraints.
4707 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4708 I != E; ++I) {
4709 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4710 if (!RC)
4711 continue;
4712
4713 const MachineOperand &Op = MI.getOperand(I);
4714 if (!Op.isReg())
4715 continue;
4716
4717 Register Reg = Op.getReg();
4718 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4719 ErrInfo = "inlineasm operand has incorrect register class.";
4720 return false;
4721 }
4722 }
4723
4724 return true;
4725 }
4726
4727 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4728 ErrInfo = "missing memory operand from image instruction.";
4729 return false;
4730 }
4731
4732 // Make sure the register classes are correct.
4733 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4734 const MachineOperand &MO = MI.getOperand(i);
4735 if (MO.isFPImm()) {
4736 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4737 "all fp values to integers.";
4738 return false;
4739 }
4740
4741 int RegClass = Desc.operands()[i].RegClass;
4742
4743 switch (Desc.operands()[i].OperandType) {
4745 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4746 ErrInfo = "Illegal immediate value for operand.";
4747 return false;
4748 }
4749 break;
4754 break;
4766 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4767 ErrInfo = "Illegal immediate value for operand.";
4768 return false;
4769 }
4770 break;
4771 }
4773 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4774 ErrInfo = "Expected inline constant for operand.";
4775 return false;
4776 }
4777 break;
4780 // Check if this operand is an immediate.
4781 // FrameIndex operands will be replaced by immediates, so they are
4782 // allowed.
4783 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4784 ErrInfo = "Expected immediate, but got non-immediate";
4785 return false;
4786 }
4787 [[fallthrough]];
4788 default:
4789 continue;
4790 }
4791
4792 if (!MO.isReg())
4793 continue;
4794 Register Reg = MO.getReg();
4795 if (!Reg)
4796 continue;
4797
4798 // FIXME: Ideally we would have separate instruction definitions with the
4799 // aligned register constraint.
4800 // FIXME: We do not verify inline asm operands, but custom inline asm
4801 // verification is broken anyway
4802 if (ST.needsAlignedVGPRs()) {
4803 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4804 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4805 if (const TargetRegisterClass *SubRC =
4806 RI.getSubRegisterClass(RC, MO.getSubReg())) {
4807 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4808 if (RC)
4809 RC = SubRC;
4810 }
4811 }
4812
4813 // Check that this is the aligned version of the class.
4814 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4815 ErrInfo = "Subtarget requires even aligned vector registers";
4816 return false;
4817 }
4818 }
4819
4820 if (RegClass != -1) {
4821 if (Reg.isVirtual())
4822 continue;
4823
4824 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4825 if (!RC->contains(Reg)) {
4826 ErrInfo = "Operand has incorrect register class.";
4827 return false;
4828 }
4829 }
4830 }
4831
4832 // Verify SDWA
4833 if (isSDWA(MI)) {
4834 if (!ST.hasSDWA()) {
4835 ErrInfo = "SDWA is not supported on this target";
4836 return false;
4837 }
4838
4839 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4840
4841 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4842 if (OpIdx == -1)
4843 continue;
4844 const MachineOperand &MO = MI.getOperand(OpIdx);
4845
4846 if (!ST.hasSDWAScalar()) {
4847 // Only VGPRS on VI
4848 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4849 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4850 return false;
4851 }
4852 } else {
4853 // No immediates on GFX9
4854 if (!MO.isReg()) {
4855 ErrInfo =
4856 "Only reg allowed as operands in SDWA instructions on GFX9+";
4857 return false;
4858 }
4859 }
4860 }
4861
4862 if (!ST.hasSDWAOmod()) {
4863 // No omod allowed on VI
4864 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4865 if (OMod != nullptr &&
4866 (!OMod->isImm() || OMod->getImm() != 0)) {
4867 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4868 return false;
4869 }
4870 }
4871
4872 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
4873 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
4874 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
4875 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
4876 const MachineOperand *Src0ModsMO =
4877 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4878 unsigned Mods = Src0ModsMO->getImm();
4879 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
4880 Mods & SISrcMods::SEXT) {
4881 ErrInfo = "sext, abs and neg are not allowed on this instruction";
4882 return false;
4883 }
4884 }
4885
4886 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4887 if (isVOPC(BasicOpcode)) {
4888 if (!ST.hasSDWASdst() && DstIdx != -1) {
4889 // Only vcc allowed as dst on VI for VOPC
4890 const MachineOperand &Dst = MI.getOperand(DstIdx);
4891 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4892 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4893 return false;
4894 }
4895 } else if (!ST.hasSDWAOutModsVOPC()) {
4896 // No clamp allowed on GFX9 for VOPC
4897 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4898 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4899 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
4900 return false;
4901 }
4902
4903 // No omod allowed on GFX9 for VOPC
4904 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4905 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4906 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
4907 return false;
4908 }
4909 }
4910 }
4911
4912 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
4913 if (DstUnused && DstUnused->isImm() &&
4914 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4915 const MachineOperand &Dst = MI.getOperand(DstIdx);
4916 if (!Dst.isReg() || !Dst.isTied()) {
4917 ErrInfo = "Dst register should have tied register";
4918 return false;
4919 }
4920
4921 const MachineOperand &TiedMO =
4922 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
4923 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
4924 ErrInfo =
4925 "Dst register should be tied to implicit use of preserved register";
4926 return false;
4927 }
4928 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
4929 ErrInfo = "Dst register should use same physical register as preserved";
4930 return false;
4931 }
4932 }
4933 }
4934
4935 // Verify MIMG / VIMAGE / VSAMPLE
4936 if (isImage(MI.getOpcode()) && !MI.mayStore()) {
4937 // Ensure that the return type used is large enough for all the options
4938 // being used TFE/LWE require an extra result register.
4939 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4940 if (DMask) {
4941 uint64_t DMaskImm = DMask->getImm();
4942 uint32_t RegCount =
4943 isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm);
4944 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
4945 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
4946 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
4947
4948 // Adjust for packed 16 bit values
4949 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
4950 RegCount = divideCeil(RegCount, 2);
4951
4952 // Adjust if using LWE or TFE
4953 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
4954 RegCount += 1;
4955
4956 const uint32_t DstIdx =
4957 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
4958 const MachineOperand &Dst = MI.getOperand(DstIdx);
4959 if (Dst.isReg()) {
4960 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
4961 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
4962 if (RegCount > DstSize) {
4963 ErrInfo = "Image instruction returns too many registers for dst "
4964 "register class";
4965 return false;
4966 }
4967 }
4968 }
4969 }
4970
4971 // Verify VOP*. Ignore multiple sgpr operands on writelane.
4972 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
4973 unsigned ConstantBusCount = 0;
4974 bool UsesLiteral = false;
4975 const MachineOperand *LiteralVal = nullptr;
4976
4977 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
4978 if (ImmIdx != -1) {
4979 ++ConstantBusCount;
4980 UsesLiteral = true;
4981 LiteralVal = &MI.getOperand(ImmIdx);
4982 }
4983
4984 SmallVector<Register, 2> SGPRsUsed;
4985 Register SGPRUsed;
4986
4987 // Only look at the true operands. Only a real operand can use the constant
4988 // bus, and we don't want to check pseudo-operands like the source modifier
4989 // flags.
4990 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
4991 if (OpIdx == -1)
4992 continue;
4993 const MachineOperand &MO = MI.getOperand(OpIdx);
4994 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4995 if (MO.isReg()) {
4996 SGPRUsed = MO.getReg();
4997 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
4998 ++ConstantBusCount;
4999 SGPRsUsed.push_back(SGPRUsed);
5000 }
5001 } else if (!MO.isFI()) { // Treat FI like a register.
5002 if (!UsesLiteral) {
5003 ++ConstantBusCount;
5004 UsesLiteral = true;
5005 LiteralVal = &MO;
5006 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5007 assert(isVOP2(MI) || isVOP3(MI));
5008 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5009 return false;
5010 }
5011 }
5012 }
5013 }
5014
5015 SGPRUsed = findImplicitSGPRRead(MI);
5016 if (SGPRUsed) {
5017 // Implicit uses may safely overlap true operands
5018 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5019 return !RI.regsOverlap(SGPRUsed, SGPR);
5020 })) {
5021 ++ConstantBusCount;
5022 SGPRsUsed.push_back(SGPRUsed);
5023 }
5024 }
5025
5026 // v_writelane_b32 is an exception from constant bus restriction:
5027 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5028 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5029 Opcode != AMDGPU::V_WRITELANE_B32) {
5030 ErrInfo = "VOP* instruction violates constant bus restriction";
5031 return false;
5032 }
5033
5034 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5035 ErrInfo = "VOP3 instruction uses literal";
5036 return false;
5037 }
5038 }
5039
5040 // Special case for writelane - this can break the multiple constant bus rule,
5041 // but still can't use more than one SGPR register
5042 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5043 unsigned SGPRCount = 0;
5044 Register SGPRUsed;
5045
5046 for (int OpIdx : {Src0Idx, Src1Idx}) {
5047 if (OpIdx == -1)
5048 break;
5049
5050 const MachineOperand &MO = MI.getOperand(OpIdx);
5051
5052 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5053 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5054 if (MO.getReg() != SGPRUsed)
5055 ++SGPRCount;
5056 SGPRUsed = MO.getReg();
5057 }
5058 }
5059 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5060 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5061 return false;
5062 }
5063 }
5064 }
5065
5066 // Verify misc. restrictions on specific instructions.
5067 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5068 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5069 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5070 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5071 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5072 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5073 if (!compareMachineOp(Src0, Src1) &&
5074 !compareMachineOp(Src0, Src2)) {
5075 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5076 return false;
5077 }
5078 }
5079 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5080 SISrcMods::ABS) ||
5081 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5082 SISrcMods::ABS) ||
5083 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5084 SISrcMods::ABS)) {
5085 ErrInfo = "ABS not allowed in VOP3B instructions";
5086 return false;
5087 }
5088 }
5089
5090 if (isSOP2(MI) || isSOPC(MI)) {
5091 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5092 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5093
5094 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5095 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5096 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5097 !Src0.isIdenticalTo(Src1)) {
5098 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5099 return false;
5100 }
5101 }
5102
5103 if (isSOPK(MI)) {
5104 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5105 if (Desc.isBranch()) {
5106 if (!Op->isMBB()) {
5107 ErrInfo = "invalid branch target for SOPK instruction";
5108 return false;
5109 }
5110 } else {
5111 uint64_t Imm = Op->getImm();
5112 if (sopkIsZext(Opcode)) {
5113 if (!isUInt<16>(Imm)) {
5114 ErrInfo = "invalid immediate for SOPK instruction";
5115 return false;
5116 }
5117 } else {
5118 if (!isInt<16>(Imm)) {
5119 ErrInfo = "invalid immediate for SOPK instruction";
5120 return false;
5121 }
5122 }
5123 }
5124 }
5125
5126 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5127 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5128 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5129 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5130 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5131 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5132
5133 const unsigned StaticNumOps =
5134 Desc.getNumOperands() + Desc.implicit_uses().size();
5135 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5136
5137 // Allow additional implicit operands. This allows a fixup done by the post
5138 // RA scheduler where the main implicit operand is killed and implicit-defs
5139 // are added for sub-registers that remain live after this instruction.
5140 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5141 ErrInfo = "missing implicit register operands";
5142 return false;
5143 }
5144
5145 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5146 if (IsDst) {
5147 if (!Dst->isUse()) {
5148 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5149 return false;
5150 }
5151
5152 unsigned UseOpIdx;
5153 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5154 UseOpIdx != StaticNumOps + 1) {
5155 ErrInfo = "movrel implicit operands should be tied";
5156 return false;
5157 }
5158 }
5159
5160 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5161 const MachineOperand &ImpUse
5162 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5163 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5164 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5165 ErrInfo = "src0 should be subreg of implicit vector use";
5166 return false;
5167 }
5168 }
5169
5170 // Make sure we aren't losing exec uses in the td files. This mostly requires
5171 // being careful when using let Uses to try to add other use registers.
5172 if (shouldReadExec(MI)) {
5173 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5174 ErrInfo = "VALU instruction does not implicitly read exec mask";
5175 return false;
5176 }
5177 }
5178
5179 if (isSMRD(MI)) {
5180 if (MI.mayStore() &&
5181 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5182 // The register offset form of scalar stores may only use m0 as the
5183 // soffset register.
5184 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5185 if (Soff && Soff->getReg() != AMDGPU::M0) {
5186 ErrInfo = "scalar stores must use m0 as offset register";
5187 return false;
5188 }
5189 }
5190 }
5191
5192 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5193 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5194 if (Offset->getImm() != 0) {
5195 ErrInfo = "subtarget does not support offsets in flat instructions";
5196 return false;
5197 }
5198 }
5199
5200 if (isDS(MI) && !ST.hasGDS()) {
5201 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5202 if (GDSOp && GDSOp->getImm() != 0) {
5203 ErrInfo = "GDS is not supported on this subtarget";
5204 return false;
5205 }
5206 }
5207
5208 if (isImage(MI)) {
5209 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5210 if (DimOp) {
5211 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5212 AMDGPU::OpName::vaddr0);
5213 int RSrcOpName =
5214 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5215 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5216 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5217 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5218 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5219 const AMDGPU::MIMGDimInfo *Dim =
5220 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5221
5222 if (!Dim) {
5223 ErrInfo = "dim is out of range";
5224 return false;
5225 }
5226
5227 bool IsA16 = false;
5228 if (ST.hasR128A16()) {
5229 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5230 IsA16 = R128A16->getImm() != 0;
5231 } else if (ST.hasA16()) {
5232 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5233 IsA16 = A16->getImm() != 0;
5234 }
5235
5236 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5237
5238 unsigned AddrWords =
5239 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5240
5241 unsigned VAddrWords;
5242 if (IsNSA) {
5243 VAddrWords = RsrcIdx - VAddr0Idx;
5244 if (ST.hasPartialNSAEncoding() &&
5245 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5246 unsigned LastVAddrIdx = RsrcIdx - 1;
5247 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5248 }
5249 } else {
5250 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5251 if (AddrWords > 12)
5252 AddrWords = 16;
5253 }
5254
5255 if (VAddrWords != AddrWords) {
5256 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5257 << " but got " << VAddrWords << "\n");
5258 ErrInfo = "bad vaddr size";
5259 return false;
5260 }
5261 }
5262 }
5263
5264 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5265 if (DppCt) {
5266 using namespace AMDGPU::DPP;
5267
5268 unsigned DC = DppCt->getImm();
5269 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5270 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5271 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5272 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5273 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5274 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5275 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5276 ErrInfo = "Invalid dpp_ctrl value";
5277 return false;
5278 }
5279 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5280 !ST.hasDPPWavefrontShifts()) {
5281 ErrInfo = "Invalid dpp_ctrl value: "
5282 "wavefront shifts are not supported on GFX10+";
5283 return false;
5284 }
5285 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5286 !ST.hasDPPBroadcasts()) {
5287 ErrInfo = "Invalid dpp_ctrl value: "
5288 "broadcasts are not supported on GFX10+";
5289 return false;
5290 }
5291 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5293 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5294 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5295 !ST.hasGFX90AInsts()) {
5296 ErrInfo = "Invalid dpp_ctrl value: "
5297 "row_newbroadcast/row_share is not supported before "
5298 "GFX90A/GFX10";
5299 return false;
5300 }
5301 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5302 ErrInfo = "Invalid dpp_ctrl value: "
5303 "row_share and row_xmask are not supported before GFX10";
5304 return false;
5305 }
5306 }
5307
5308 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5310 ErrInfo = "Invalid dpp_ctrl value: "
5311 "DP ALU dpp only support row_newbcast";
5312 return false;
5313 }
5314 }
5315
5316 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5317 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5318 uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
5319 : AMDGPU::OpName::vdata;
5320 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
5321 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5322 if (Data && !Data->isReg())
5323 Data = nullptr;
5324
5325 if (ST.hasGFX90AInsts()) {
5326 if (Dst && Data &&
5327 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5328 ErrInfo = "Invalid register class: "
5329 "vdata and vdst should be both VGPR or AGPR";
5330 return false;
5331 }
5332 if (Data && Data2 &&
5333 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5334 ErrInfo = "Invalid register class: "
5335 "both data operands should be VGPR or AGPR";
5336 return false;
5337 }
5338 } else {
5339 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5340 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5341 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5342 ErrInfo = "Invalid register class: "
5343 "agpr loads and stores not supported on this GPU";
5344 return false;
5345 }
5346 }
5347 }
5348
5349 if (ST.needsAlignedVGPRs()) {
5350 const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
5352 if (!Op)
5353 return true;
5354 Register Reg = Op->getReg();
5355 if (Reg.isPhysical())
5356 return !(RI.getHWRegIndex(Reg) & 1);
5357 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5358 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5359 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5360 };
5361
5362 if (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
5363 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
5364 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
5365
5366 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5367 ErrInfo = "Subtarget requires even aligned vector registers "
5368 "for DS_GWS instructions";
5369 return false;
5370 }
5371 }
5372
5373 if (isMIMG(MI)) {
5374 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5375 ErrInfo = "Subtarget requires even aligned vector registers "
5376 "for vaddr operand of image instructions";
5377 return false;
5378 }
5379 }
5380 }
5381
5382 if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
5383 !ST.hasGFX90AInsts()) {
5384 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5385 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5386 ErrInfo = "Invalid register class: "
5387 "v_accvgpr_write with an SGPR is not supported on this GPU";
5388 return false;
5389 }
5390 }
5391
5392 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5393 const MachineOperand &SrcOp = MI.getOperand(1);
5394 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5395 ErrInfo = "pseudo expects only physical SGPRs";
5396 return false;
5397 }
5398 }
5399
5400 return true;
5401}
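// Illustrative usage sketch (a hint, not upstream code): the ErrInfo strings
// set above are surfaced through the MachineVerifier when machine verification
// is enabled (e.g. with -verify-machineinstrs), roughly as
//   *** Bad machine code: bad vaddr size ***
// followed by the offending function, basic block and instruction. A pass can
// also query the hook directly (TII and MI are assumed to be an SIInstrInfo
// pointer and a MachineInstr reference already in scope):
//   StringRef Err;
//   if (!TII->verifyInstruction(MI, Err))
//     errs() << "rejected: " << Err << '\n';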
5402
5403// It is more readable to list mapped opcodes on the same line.
5404// clang-format off
5405
5407 switch (MI.getOpcode()) {
5408 default: return AMDGPU::INSTRUCTION_LIST_END;
5409 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5410 case AMDGPU::COPY: return AMDGPU::COPY;
5411 case AMDGPU::PHI: return AMDGPU::PHI;
5412 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5413 case AMDGPU::WQM: return AMDGPU::WQM;
5414 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5415 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5416 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5417 case AMDGPU::S_MOV_B32: {
5418 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5419 return MI.getOperand(1).isReg() ||
5420 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5421 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5422 }
5423 case AMDGPU::S_ADD_I32:
5424 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5425 case AMDGPU::S_ADDC_U32:
5426 return AMDGPU::V_ADDC_U32_e32;
5427 case AMDGPU::S_SUB_I32:
5428 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5429 // FIXME: These are not consistently handled, and selected when the carry is
5430 // used.
5431 case AMDGPU::S_ADD_U32:
5432 return AMDGPU::V_ADD_CO_U32_e32;
5433 case AMDGPU::S_SUB_U32:
5434 return AMDGPU::V_SUB_CO_U32_e32;
5435 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5436 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5437 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5438 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5439 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5440 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5441 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5442 case AMDGPU::S_XNOR_B32:
5443 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5444 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5445 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5446 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5447 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5448 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5449 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5450 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5451 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5452 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5453 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5454 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5455 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5456 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5457 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5458 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5459 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5460 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5461 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5462 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5463 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5464 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5465 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5466 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5467 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5468 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5469 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5470 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5471 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5472 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5473 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5474 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5475 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5476 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5477 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5478 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5479 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5480 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5481 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5482 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5483 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5484 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5485 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5486 case AMDGPU::S_CVT_F32_F16:
5487 case AMDGPU::S_CVT_HI_F32_F16:
5488 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5489 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5490 case AMDGPU::S_CVT_F16_F32:
5491 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5492 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5493 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5494 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5495 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5496 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5497 case AMDGPU::S_CEIL_F16:
5498 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5499 : AMDGPU::V_CEIL_F16_fake16_e64;
5500 case AMDGPU::S_FLOOR_F16:
5501 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5502 : AMDGPU::V_FLOOR_F16_fake16_e64;
5503 case AMDGPU::S_TRUNC_F16:
5504 return AMDGPU::V_TRUNC_F16_fake16_e64;
5505 case AMDGPU::S_RNDNE_F16:
5506 return AMDGPU::V_RNDNE_F16_fake16_e64;
5507 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5508 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5509 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5510 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5511 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5512 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5513 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5514 case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
5515 case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
5516 case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
5517 case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5518 case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5519 case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5520 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
5521 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5522 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5523 case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
5524 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5525 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5526 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5527 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5528 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5529 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5530 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5531 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5532 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5533 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5534 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5535 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5536 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5537 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5538 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5539 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5540 case AMDGPU::S_CMP_LT_F16:
5541 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5542 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5543 case AMDGPU::S_CMP_EQ_F16:
5544 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5545 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5546 case AMDGPU::S_CMP_LE_F16:
5547 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5548 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5549 case AMDGPU::S_CMP_GT_F16:
5550 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5551 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5552 case AMDGPU::S_CMP_LG_F16:
5553 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5554 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5555 case AMDGPU::S_CMP_GE_F16:
5556 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5557 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5558 case AMDGPU::S_CMP_O_F16:
5559 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5560 : AMDGPU::V_CMP_O_F16_fake16_e64;
5561 case AMDGPU::S_CMP_U_F16:
5562 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5563 : AMDGPU::V_CMP_U_F16_fake16_e64;
5564 case AMDGPU::S_CMP_NGE_F16:
5565 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5566 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5567 case AMDGPU::S_CMP_NLG_F16:
5568 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5569 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5570 case AMDGPU::S_CMP_NGT_F16:
5571 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5572 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5573 case AMDGPU::S_CMP_NLE_F16:
5574 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5575 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5576 case AMDGPU::S_CMP_NEQ_F16:
5577 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5578 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5579 case AMDGPU::S_CMP_NLT_F16:
5580 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5581 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5582 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5583 case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
5584 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5585 case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
5586 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5587 case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
5588 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5589 case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
5590 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5591 case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
5592  }
5593  llvm_unreachable(
5594 "Unexpected scalar opcode without corresponding vector one!");
5595}
5596
5597// clang-format on
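// A few illustrative mappings from the table above, as applied when scalar
// code is moved to the VALU: S_AND_B32 becomes V_AND_B32_e64, S_MUL_I32
// becomes V_MUL_LO_U32_e64, and S_CMP_EQ_I32 becomes V_CMP_EQ_I32_e64 (which
// writes a condition register rather than SCC). Entries that return
// INSTRUCTION_LIST_END, such as S_XNOR_B32 on subtargets without hasDLInsts(),
// have no single VALU equivalent and are handled separately by the callers.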
5598
5602 const DebugLoc &DL, Register Reg,
5603 bool IsSCCLive,
5604 SlotIndexes *Indexes) const {
5605 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5606 const SIInstrInfo *TII = ST.getInstrInfo();
5607 bool IsWave32 = ST.isWave32();
5608 if (IsSCCLive) {
5609 // Insert two move instructions, one to save the original value of EXEC and
5610 // the other to turn on all bits in EXEC. This is required as we can't use
5611 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5612 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5613 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5614 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5615 .addReg(Exec, RegState::Kill);
5616 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5617 if (Indexes) {
5618 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5619 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5620 }
5621 } else {
5622 const unsigned OrSaveExec =
5623 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5624 auto SaveExec =
5625 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5626 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5627 if (Indexes)
5628 Indexes->insertMachineInstrInMaps(*SaveExec);
5629 }
5630}
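// A rough sketch (assuming a wave64 subtarget and an arbitrary SGPR pair) of
// the two sequences emitted above:
//   SCC live:  $sgpr0_sgpr1 = S_MOV_B64 $exec        ; save EXEC
//              $exec        = S_MOV_B64 -1           ; enable all lanes
//   SCC dead:  $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 -1   ; save + enable in one
//                                                    ; instruction, clobbers SCC
// The restore helper that follows emits the matching
//   $exec = S_MOV_B64 $sgpr0_sgpr1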
5631
5634 const DebugLoc &DL, Register Reg,
5635 SlotIndexes *Indexes) const {
5636 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5637 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5638 auto ExecRestoreMI =
5639 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5640 if (Indexes)
5641 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5642}
5643
5644static const TargetRegisterClass *
5646 const MachineRegisterInfo &MRI,
5647 const MCInstrDesc &TID, unsigned RCID,
5648 bool IsAllocatable) {
5649 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5650 (((TID.mayLoad() || TID.mayStore()) &&
5651 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5653 switch (RCID) {
5654 case AMDGPU::AV_32RegClassID:
5655 RCID = AMDGPU::VGPR_32RegClassID;
5656 break;
5657 case AMDGPU::AV_64RegClassID:
5658 RCID = AMDGPU::VReg_64RegClassID;
5659 break;
5660 case AMDGPU::AV_96RegClassID:
5661 RCID = AMDGPU::VReg_96RegClassID;
5662 break;
5663 case AMDGPU::AV_128RegClassID:
5664 RCID = AMDGPU::VReg_128RegClassID;
5665 break;
5666 case AMDGPU::AV_160RegClassID:
5667 RCID = AMDGPU::VReg_160RegClassID;
5668 break;
5669 case AMDGPU::AV_512RegClassID:
5670 RCID = AMDGPU::VReg_512RegClassID;
5671 break;
5672 default:
5673 break;
5674 }
5675 }
5676
5677 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5678}
5679
5681 unsigned OpNum, const TargetRegisterInfo *TRI,
5682 const MachineFunction &MF)
5683 const {
5684 if (OpNum >= TID.getNumOperands())
5685 return nullptr;
5686 auto RegClass = TID.operands()[OpNum].RegClass;
5687 bool IsAllocatable = false;
5689 // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
5690    // with two data operands. Request a register class constrained to VGPR only
5691    // if both operands are present, as Machine Copy Propagation cannot check this
5692    // constraint (and possibly other passes cannot either).
5693 //
5694 // The check is limited to FLAT and DS because atomics in non-flat encoding
5695 // have their vdst and vdata tied to be the same register.
5696 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5697 AMDGPU::OpName::vdst);
5698 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5699 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5700 : AMDGPU::OpName::vdata);
5701 if (DataIdx != -1) {
5702 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5703 TID.Opcode, AMDGPU::OpName::data1);
5704 }
5705 }
5706 return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
5707 IsAllocatable);
5708}
5709
5711 unsigned OpNo) const {
5712 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5713 const MCInstrDesc &Desc = get(MI.getOpcode());
5714 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
5715 Desc.operands()[OpNo].RegClass == -1) {
5716 Register Reg = MI.getOperand(OpNo).getReg();
5717
5718 if (Reg.isVirtual())
5719 return MRI.getRegClass(Reg);
5720 return RI.getPhysRegBaseClass(Reg);
5721 }
5722
5723 unsigned RCID = Desc.operands()[OpNo].RegClass;
5724 return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
5725}
5726
5729 MachineBasicBlock *MBB = MI.getParent();
5730 MachineOperand &MO = MI.getOperand(OpIdx);
5732 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
5733 const TargetRegisterClass *RC = RI.getRegClass(RCID);
5734 unsigned Size = RI.getRegSizeInBits(*RC);
5735 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
5736 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
5737 : AMDGPU::V_MOV_B32_e32;
5738 if (MO.isReg())
5739 Opcode = AMDGPU::COPY;
5740 else if (RI.isSGPRClass(RC))
5741 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
5742
5743 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
5744 Register Reg = MRI.createVirtualRegister(VRC);
5746 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
5747 MO.ChangeToRegister(Reg, false);
5748}
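// Illustrative example (virtual register names invented): legalizing an
// immediate that may not sit directly in the operand rewrites it through a
// fresh VGPR, roughly
//   %tmp:vgpr_32 = V_MOV_B32_e32 1234
//   ... the operand at OpIdx now reads %tmp instead of the literal ...
// Register operands use a plain COPY, and SGPR-class operand slots use
// S_MOV_B32/S_MOV_B64 instead, as selected above.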
5749
5752 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
5753 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5754 if (!SuperReg.getReg().isVirtual())
5755 return RI.getSubReg(SuperReg.getReg(), SubIdx);
5756
5757 MachineBasicBlock *MBB = MI->getParent();
5758 DebugLoc DL = MI->getDebugLoc();
5759 Register SubReg = MRI.createVirtualRegister(SubRC);
5760
5761 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
5762 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5763 .addReg(SuperReg.getReg(), 0, NewSubIdx);
5764 return SubReg;
5765}
5766
5769 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
5770 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5771 if (Op.isImm()) {
5772 if (SubIdx == AMDGPU::sub0)
5773 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
5774 if (SubIdx == AMDGPU::sub1)
5775 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
5776
5777 llvm_unreachable("Unhandled register index for immediate");
5778 }
5779
5780 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
5781 SubIdx, SubRC);
5782 return MachineOperand::CreateReg(SubReg, false);
5783}
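// Worked example for the immediate path above: splitting the 64-bit constant
// 0x1234ABCD00000005 gives sub0 = 0x00000005 (the low 32 bits) and
// sub1 = 0x1234ABCD (the high 32 bits); any other sub-register index is
// unreachable here. Register operands instead go through buildExtractSubReg,
// which emits a COPY of the requested sub-register.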
5784
5785// Change the order of operands from (0, 1, 2) to (0, 2, 1)
5786void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
5787 assert(Inst.getNumExplicitOperands() == 3);
5788 MachineOperand Op1 = Inst.getOperand(1);
5789 Inst.removeOperand(1);
5790 Inst.addOperand(Op1);
5791}
5792
5794 const MCOperandInfo &OpInfo,
5795 const MachineOperand &MO) const {
5796 if (!MO.isReg())
5797 return false;
5798
5799 Register Reg = MO.getReg();
5800
5801 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
5802 if (Reg.isPhysical())
5803 return DRC->contains(Reg);
5804
5805 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
5806
5807 if (MO.getSubReg()) {
5808 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5809 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
5810 if (!SuperRC)
5811 return false;
5812
5813 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
5814 if (!DRC)
5815 return false;
5816 }
5817 return RC->hasSuperClassEq(DRC);
5818}
5819
5821 const MCOperandInfo &OpInfo,
5822 const MachineOperand &MO) const {
5823 if (MO.isReg())
5824 return isLegalRegOperand(MRI, OpInfo, MO);
5825
5826 // Handle non-register types that are treated like immediates.
5827 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
5828 return true;
5829}
5830
5831bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
5832 const MachineOperand *MO) const {
5833 const MachineFunction &MF = *MI.getParent()->getParent();
5834 const MachineRegisterInfo &MRI = MF.getRegInfo();
5835 const MCInstrDesc &InstDesc = MI.getDesc();
5836 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
5837 const TargetRegisterClass *DefinedRC =
5838 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
5839 if (!MO)
5840 MO = &MI.getOperand(OpIdx);
5841
5842 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
5843 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
5844 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
5845 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--)
5846 return false;
5847
5849 if (MO->isReg())
5850 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
5851
5852 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
5853 if (i == OpIdx)
5854 continue;
5855 const MachineOperand &Op = MI.getOperand(i);
5856 if (Op.isReg()) {
5857 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
5858 if (!SGPRsUsed.count(SGPR) &&
5859 // FIXME: This can access off the end of the operands() array.
5860 usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
5861 if (--ConstantBusLimit <= 0)
5862 return false;
5863 SGPRsUsed.insert(SGPR);
5864 }
5865 } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
5866 !isInlineConstant(Op, InstDesc.operands()[i])) {
5867 if (!LiteralLimit--)
5868 return false;
5869 if (--ConstantBusLimit <= 0)
5870 return false;
5871 }
5872 }
5873 } else if (ST.hasNoF16PseudoScalarTransInlineConstants() && !MO->isReg() &&
5874 isF16PseudoScalarTrans(MI.getOpcode()) &&
5875 isInlineConstant(*MO, OpInfo)) {
5876 return false;
5877 }
5878
5879 if (MO->isReg()) {
5880 if (!DefinedRC)
5881 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
5882 if (!isLegalRegOperand(MRI, OpInfo, *MO))
5883 return false;
5884 bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
5885 if (IsAGPR && !ST.hasMAIInsts())
5886 return false;
5887 unsigned Opc = MI.getOpcode();
5888 if (IsAGPR &&
5889 (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5890 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5891 return false;
5892 // Atomics should have both vdst and vdata either vgpr or agpr.
5893 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5894 const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
5895 isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5896 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5897 MI.getOperand(DataIdx).isReg() &&
5898 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5899 return false;
5900 if ((int)OpIdx == DataIdx) {
5901 if (VDstIdx != -1 &&
5902 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5903 return false;
5904 // DS instructions with 2 src operands also must have tied RC.
5905 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
5906 AMDGPU::OpName::data1);
5907 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5908 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5909 return false;
5910 }
5911 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5912 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5913 RI.isSGPRReg(MRI, MO->getReg()))
5914 return false;
5915 return true;
5916 }
5917
5918 if (MO->isImm()) {
5919 uint64_t Imm = MO->getImm();
5920 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
5921 bool Is64BitOp = Is64BitFPOp ||
5925 if (Is64BitOp &&
5927 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
5928 return false;
5929
5930 // FIXME: We can use sign extended 64-bit literals, but only for signed
5931 // operands. At the moment we do not know if an operand is signed.
5932 // Such operand will be encoded as its low 32 bits and then either
5933 // correctly sign extended or incorrectly zero extended by HW.
5934 if (!Is64BitFPOp && (int32_t)Imm < 0)
5935 return false;
5936 }
5937 }
5938
5939 // Handle non-register types that are treated like immediates.
5940 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
5941
5942 if (!DefinedRC) {
5943 // This operand expects an immediate.
5944 return true;
5945 }
5946
5947 return isImmOperandLegal(MI, OpIdx, *MO);
5948}
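// Illustrative consequence of the constant-bus accounting above: on a
// subtarget where getConstantBusLimit() is 1 (before GFX10), a VALU
// instruction that already reads one SGPR cannot accept a second, distinct
// SGPR or an extra literal in another source, so the candidate operand is
// reported illegal; on GFX10+ the limit is generally 2, and a VOP3 literal is
// also allowed when hasVOP3Literal() is set.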
5949
5951 MachineInstr &MI) const {
5952 unsigned Opc = MI.getOpcode();
5953 const MCInstrDesc &InstrDesc = get(Opc);
5954
5955 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5956 MachineOperand &Src0 = MI.getOperand(Src0Idx);
5957
5958 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
5959 MachineOperand &Src1 = MI.getOperand(Src1Idx);
5960
5961  // If there is an implicit SGPR use, such as the VCC use of
5962  // v_addc_u32/v_subb_u32, we are limited to a single constant bus use before GFX10.
5963 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
5964 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
5965 RI.isSGPRReg(MRI, Src0.getReg()))
5966 legalizeOpWithMove(MI, Src0Idx);
5967
5968 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
5969 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
5970 // src0/src1 with V_READFIRSTLANE.
5971 if (Opc == AMDGPU::V_WRITELANE_B32) {
5972 const DebugLoc &DL = MI.getDebugLoc();
5973 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
5974 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5975 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5976 .add(Src0);
5977 Src0.ChangeToRegister(Reg, false);
5978 }
5979 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
5980 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5981 const DebugLoc &DL = MI.getDebugLoc();
5982 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5983 .add(Src1);
5984 Src1.ChangeToRegister(Reg, false);
5985 }
5986 return;
5987 }
5988
5989 // No VOP2 instructions support AGPRs.
5990 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
5991 legalizeOpWithMove(MI, Src0Idx);
5992
5993 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
5994 legalizeOpWithMove(MI, Src1Idx);
5995
5996 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
5997 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
5998 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
5999 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6000 legalizeOpWithMove(MI, Src2Idx);
6001 }
6002
6003  // The src0 operand of VOP2 instructions supports all operand types, so we don't
6004  // need to check its legality. If src1 is already legal, we don't need to do anything.
6005 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6006 return;
6007
6008 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6009 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6010 // select is uniform.
6011 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6012 RI.isVGPR(MRI, Src1.getReg())) {
6013 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6014 const DebugLoc &DL = MI.getDebugLoc();
6015 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6016 .add(Src1);
6017 Src1.ChangeToRegister(Reg, false);
6018 return;
6019 }
6020
6021 // We do not use commuteInstruction here because it is too aggressive and will
6022 // commute if it is possible. We only want to commute here if it improves
6023 // legality. This can be called a fairly large number of times so don't waste
6024 // compile time pointlessly swapping and checking legality again.
6025 if (HasImplicitSGPR || !MI.isCommutable()) {
6026 legalizeOpWithMove(MI, Src1Idx);
6027 return;
6028 }
6029
6030 // If src0 can be used as src1, commuting will make the operands legal.
6031 // Otherwise we have to give up and insert a move.
6032 //
6033 // TODO: Other immediate-like operand kinds could be commuted if there was a
6034 // MachineOperand::ChangeTo* for them.
6035 if ((!Src1.isImm() && !Src1.isReg()) ||
6036 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6037 legalizeOpWithMove(MI, Src1Idx);
6038 return;
6039 }
6040
6041 int CommutedOpc = commuteOpcode(MI);
6042 if (CommutedOpc == -1) {
6043 legalizeOpWithMove(MI, Src1Idx);
6044 return;
6045 }
6046
6047 MI.setDesc(get(CommutedOpc));
6048
6049 Register Src0Reg = Src0.getReg();
6050 unsigned Src0SubReg = Src0.getSubReg();
6051 bool Src0Kill = Src0.isKill();
6052
6053 if (Src1.isImm())
6054 Src0.ChangeToImmediate(Src1.getImm());
6055 else if (Src1.isReg()) {
6056 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6057 Src0.setSubReg(Src1.getSubReg());
6058 } else
6059 llvm_unreachable("Should only have register or immediate operands");
6060
6061 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6062 Src1.setSubReg(Src0SubReg);
6064}
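// Illustrative example: if instruction selection leaves an SGPR in src1 of a
// commutable VOP2 (src1 must normally be a VGPR), the code above prefers to
// commute so the SGPR lands in src0, which accepts any operand kind. Only when
// commuting cannot help -- e.g. an implicit SGPR (VCC) read already uses the
// single pre-GFX10 constant bus slot, or the opcode is not commutable -- does
// it fall back to legalizeOpWithMove() and insert a copy or move.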
6065
6066// Legalize VOP3 operands. All operand types are supported for any operand,
6067// but only one literal constant is allowed, and only starting from GFX10.
6069 MachineInstr &MI) const {
6070 unsigned Opc = MI.getOpcode();
6071
6072 int VOP3Idx[3] = {
6073 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6074 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6075 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6076 };
6077
6078 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6079 Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
6080 // src1 and src2 must be scalar
6081 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6082 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6083 const DebugLoc &DL = MI.getDebugLoc();
6084 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6085 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6086 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6087 .add(Src1);
6088 Src1.ChangeToRegister(Reg, false);
6089 }
6090 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6091 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6092 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6093 .add(Src2);
6094 Src2.ChangeToRegister(Reg, false);
6095 }
6096 }
6097
6098 // Find the one SGPR operand we are allowed to use.
6099 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6100 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6101 SmallDenseSet<unsigned> SGPRsUsed;
6102 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6103 if (SGPRReg) {
6104 SGPRsUsed.insert(SGPRReg);
6105 --ConstantBusLimit;
6106 }
6107
6108 for (int Idx : VOP3Idx) {
6109 if (Idx == -1)
6110 break;
6111 MachineOperand &MO = MI.getOperand(Idx);
6112
6113 if (!MO.isReg()) {
6114 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6115 continue;
6116
6117 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6118 --LiteralLimit;
6119 --ConstantBusLimit;
6120 continue;
6121 }
6122
6123 --LiteralLimit;
6124 --ConstantBusLimit;
6126 continue;
6127 }
6128
6129 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6130 !isOperandLegal(MI, Idx, &MO)) {
6132 continue;
6133 }
6134
6135 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6136 continue; // VGPRs are legal
6137
6138 // We can use one SGPR in each VOP3 instruction prior to GFX10
6139 // and two starting from GFX10.
6140 if (SGPRsUsed.count(MO.getReg()))
6141 continue;
6142 if (ConstantBusLimit > 0) {
6143 SGPRsUsed.insert(MO.getReg());
6144 --ConstantBusLimit;
6145 continue;
6146 }
6147
6148 // If we make it this far, then the operand is not legal and we must
6149 // legalize it.
6151 }
6152
6153 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6154 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6155 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6156 legalizeOpWithMove(MI, VOP3Idx[2]);
6157}
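// Illustrative example (pre-GFX10, constant bus limit 1): a VOP3 with SGPR
// values in both src0 and src1 keeps the first SGPR and has the second
// rewritten via legalizeOpWithMove() into a VGPR, so only one scalar source
// crosses the constant bus. On GFX10+ two distinct SGPRs are allowed, plus one
// literal when hasVOP3Literal() is set.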
6158
6161 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6162 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6163 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6164 if (DstRC)
6165 SRC = RI.getCommonSubClass(SRC, DstRC);
6166
6167 Register DstReg = MRI.createVirtualRegister(SRC);
6168 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6169
6170 if (RI.hasAGPRs(VRC)) {
6171 VRC = RI.getEquivalentVGPRClass(VRC);
6172 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6173 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6174 get(TargetOpcode::COPY), NewSrcReg)
6175 .addReg(SrcReg);
6176 SrcReg = NewSrcReg;
6177 }
6178
6179 if (SubRegs == 1) {
6180 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6181 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6182 .addReg(SrcReg);
6183 return DstReg;
6184 }
6185
6187 for (unsigned i = 0; i < SubRegs; ++i) {
6188 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6189 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6190 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6191 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6192 SRegs.push_back(SGPR);
6193 }
6194
6196 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6197 get(AMDGPU::REG_SEQUENCE), DstReg);
6198 for (unsigned i = 0; i < SubRegs; ++i) {
6199 MIB.addReg(SRegs[i]);
6200 MIB.addImm(RI.getSubRegFromChannel(i));
6201 }
6202 return DstReg;
6203}
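// A sketch of what the helper above emits for a 64-bit (two sub-register)
// source, with invented virtual register names:
//   %lo:sgpr_32 = V_READFIRSTLANE_B32 %src.sub0
//   %hi:sgpr_32 = V_READFIRSTLANE_B32 %src.sub1
//   %dst        = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// AGPR sources are first copied to an equivalent VGPR class, since
// V_READFIRSTLANE_B32 only reads VGPRs.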
6204
6206 MachineInstr &MI) const {
6207
6208  // If the pointer is stored in VGPRs, then we need to move it to
6209  // SGPRs using v_readfirstlane. This is safe because we only select
6210  // loads with uniform pointers to SMRD instructions, so we know the
6211  // pointer value is uniform.
6212 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6213 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6214 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6215 SBase->setReg(SGPR);
6216 }
6217 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6218 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6219 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6220 SOff->setReg(SGPR);
6221 }
6222}
6223
6225 unsigned Opc = Inst.getOpcode();
6226 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6227 if (OldSAddrIdx < 0)
6228 return false;
6229
6231
6232 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6233 if (NewOpc < 0)
6235 if (NewOpc < 0)
6236 return false;
6237
6239 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6240 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6241 return false;
6242
6243 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6244 if (NewVAddrIdx < 0)
6245 return false;
6246
6247 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6248
6249 // Check vaddr, it shall be zero or absent.
6250 MachineInstr *VAddrDef = nullptr;
6251 if (OldVAddrIdx >= 0) {
6252 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6253 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6254 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6255 !VAddrDef->getOperand(1).isImm() ||
6256 VAddrDef->getOperand(1).getImm() != 0)
6257 return false;
6258 }
6259
6260 const MCInstrDesc &NewDesc = get(NewOpc);
6261 Inst.setDesc(NewDesc);
6262
6263 // Callers expect iterator to be valid after this call, so modify the
6264 // instruction in place.
6265 if (OldVAddrIdx == NewVAddrIdx) {
6266 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6267 // Clear use list from the old vaddr holding a zero register.
6268 MRI.removeRegOperandFromUseList(&NewVAddr);
6269 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6270 Inst.removeOperand(OldSAddrIdx);
6271 // Update the use list with the pointer we have just moved from vaddr to
6272 // saddr position. Otherwise new vaddr will be missing from the use list.
6273 MRI.removeRegOperandFromUseList(&NewVAddr);
6274 MRI.addRegOperandToUseList(&NewVAddr);
6275 } else {
6276 assert(OldSAddrIdx == NewVAddrIdx);
6277
6278 if (OldVAddrIdx >= 0) {
6279 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6280 AMDGPU::OpName::vdst_in);
6281
6282      // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6283 // it asserts. Untie the operands for now and retie them afterwards.
6284 if (NewVDstIn != -1) {
6285 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6286 Inst.untieRegOperand(OldVDstIn);
6287 }
6288
6289 Inst.removeOperand(OldVAddrIdx);
6290
6291 if (NewVDstIn != -1) {
6292 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6293 Inst.tieOperands(NewVDst, NewVDstIn);
6294 }
6295 }
6296 }
6297
6298 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6299 VAddrDef->eraseFromParent();
6300
6301 return true;
6302}
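// Illustrative example of the rewrite above: a GLOBAL_LOAD_DWORD_SADDR whose
// saddr operand ended up in a VGPR, and whose vaddr (if present) is a
// V_MOV_B32_e32 of 0, is re-described as the corresponding GLOBAL_LOAD_DWORD
// form with the pointer moved into the vaddr slot; the now-unused zero move is
// erased afterwards. If saddr is already an SGPR, or the vaddr is not a known
// zero, the instruction is left untouched and the caller falls back to the
// usual readfirstlane legalization.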
6303
6304// FIXME: Remove this when SelectionDAG is obsoleted.
6306 MachineInstr &MI) const {
6308 return;
6309
6310  // Fix up SGPR operands in VGPRs. We only select these when DAG divergence
6311  // analysis thinks they are uniform, so a readfirstlane should be valid.
6312 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6313 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6314 return;
6315
6317 return;
6318
6319 const TargetRegisterClass *DeclaredRC = getRegClass(
6320 MI.getDesc(), SAddr->getOperandNo(), &RI, *MI.getParent()->getParent());
6321
6322 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6323 SAddr->setReg(ToSGPR);
6324}
6325
6328 const TargetRegisterClass *DstRC,
6331 const DebugLoc &DL) const {
6332 Register OpReg = Op.getReg();
6333 unsigned OpSubReg = Op.getSubReg();
6334
6335 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6336 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6337
6338 // Check if operand is already the correct register class.
6339 if (DstRC == OpRC)
6340 return;
6341
6342 Register DstReg = MRI.createVirtualRegister(DstRC);
6343 auto Copy =
6344 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6345 Op.setReg(DstReg);
6346
6347 MachineInstr *Def = MRI.getVRegDef(OpReg);
6348 if (!Def)
6349 return;
6350
6351 // Try to eliminate the copy if it is copying an immediate value.
6352 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6353 foldImmediate(*Copy, *Def, OpReg, &MRI);
6354
6355 bool ImpDef = Def->isImplicitDef();
6356 while (!ImpDef && Def && Def->isCopy()) {
6357 if (Def->getOperand(1).getReg().isPhysical())
6358 break;
6359 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6360 ImpDef = Def && Def->isImplicitDef();
6361 }
6362 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6363 !ImpDef)
6364 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6365}
6366
6367// Emit the actual waterfall loop, executing the wrapped instruction for each
6368// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6369// iteration, in the worst case we execute 64 (once per lane).
6370static void
6373 MachineBasicBlock &LoopBB,
6374 MachineBasicBlock &BodyBB,
6375 const DebugLoc &DL,
6376 ArrayRef<MachineOperand *> ScalarOps) {
6377 MachineFunction &MF = *LoopBB.getParent();
6378 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6379 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6380 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6381 unsigned SaveExecOpc =
6382 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6383 unsigned XorTermOpc =
6384 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6385 unsigned AndOpc =
6386 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6387 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6388
6390 Register CondReg;
6391
6392 for (MachineOperand *ScalarOp : ScalarOps) {
6393 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6394 unsigned NumSubRegs = RegSize / 32;
6395 Register VScalarOp = ScalarOp->getReg();
6396
6397 if (NumSubRegs == 1) {
6398 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6399
6400 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6401 .addReg(VScalarOp);
6402
6403 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6404
6405 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6406 .addReg(CurReg)
6407 .addReg(VScalarOp);
6408
6409 // Combine the comparison results with AND.
6410 if (!CondReg) // First.
6411 CondReg = NewCondReg;
6412 else { // If not the first, we create an AND.
6413 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6414 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6415 .addReg(CondReg)
6416 .addReg(NewCondReg);
6417 CondReg = AndReg;
6418 }
6419
6420 // Update ScalarOp operand to use the SGPR ScalarOp.
6421 ScalarOp->setReg(CurReg);
6422 ScalarOp->setIsKill();
6423 } else {
6424 SmallVector<Register, 8> ReadlanePieces;
6425 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6426 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6427 "Unhandled register size");
6428
6429 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6430 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6431 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6432
6433 // Read the next variant <- also loop target.
6434 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6435 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6436
6437 // Read the next variant <- also loop target.
6438 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6439 .addReg(VScalarOp, VScalarOpUndef,
6440 TRI->getSubRegFromChannel(Idx + 1));
6441
6442 ReadlanePieces.push_back(CurRegLo);
6443 ReadlanePieces.push_back(CurRegHi);
6444
6445 // Comparison is to be done as 64-bit.
6446 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6447 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6448 .addReg(CurRegLo)
6449 .addImm(AMDGPU::sub0)
6450 .addReg(CurRegHi)
6451 .addImm(AMDGPU::sub1);
6452
6453 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6454 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6455 NewCondReg)
6456 .addReg(CurReg);
6457 if (NumSubRegs <= 2)
6458 Cmp.addReg(VScalarOp);
6459 else
6460 Cmp.addReg(VScalarOp, VScalarOpUndef,
6461 TRI->getSubRegFromChannel(Idx, 2));
6462
6463 // Combine the comparison results with AND.
6464 if (!CondReg) // First.
6465 CondReg = NewCondReg;
6466 else { // If not the first, we create an AND.
6467 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6468 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6469 .addReg(CondReg)
6470 .addReg(NewCondReg);
6471 CondReg = AndReg;
6472 }
6473 } // End for loop.
6474
6475 const auto *SScalarOpRC =
6476 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6477 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6478
6479 // Build scalar ScalarOp.
6480 auto Merge =
6481 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6482 unsigned Channel = 0;
6483 for (Register Piece : ReadlanePieces) {
6484 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6485 }
6486
6487 // Update ScalarOp operand to use the SGPR ScalarOp.
6488 ScalarOp->setReg(SScalarOp);
6489 ScalarOp->setIsKill();
6490 }
6491 }
6492
6493 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6494 MRI.setSimpleHint(SaveExec, CondReg);
6495
6496 // Update EXEC to matching lanes, saving original to SaveExec.
6497 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6498 .addReg(CondReg, RegState::Kill);
6499
6500 // The original instruction is here; we insert the terminators after it.
6501 I = BodyBB.end();
6502
6503 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6504 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6505 .addReg(Exec)
6506 .addReg(SaveExec);
6507
6508 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6509}
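// A rough sketch of the loop emitted above for a single 32-bit scalar operand
// on a wave64 subtarget (virtual register names invented):
//   LoopBB:
//     %cur  = V_READFIRSTLANE_B32 %vscalar       ; pick one lane's value
//     %cond = V_CMP_EQ_U32_e64 %cur, %vscalar    ; lanes holding that value
//     %save = S_AND_SAVEEXEC_B64 %cond           ; restrict EXEC to them
//   BodyBB:
//     ... the waterfalled instruction, now reading %cur ...
//     $exec = S_XOR_B64_term $exec, %save        ; retire the processed lanes
//     SI_WATERFALL_LOOP %LoopBB                  ; loop while any lane remains
// Wider operands use pairs of readfirstlanes, a 64-bit compare per pair, and
// an S_AND of the partial conditions, exactly as in the code above.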
6510
6511// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOps
6512// registers with SGPRs by iterating over all unique values across all lanes.
6513// Returns the loop basic block that now contains \p MI.
6514static MachineBasicBlock *
6518 MachineBasicBlock::iterator Begin = nullptr,
6519 MachineBasicBlock::iterator End = nullptr) {
6520 MachineBasicBlock &MBB = *MI.getParent();
6521 MachineFunction &MF = *MBB.getParent();
6522 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6523 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6525 if (!Begin.isValid())
6526 Begin = &MI;
6527 if (!End.isValid()) {
6528 End = &MI;
6529 ++End;
6530 }
6531 const DebugLoc &DL = MI.getDebugLoc();
6532 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6533 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6534 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6535
6536 // Save SCC. Waterfall Loop may overwrite SCC.
6537 Register SaveSCCReg;
6538
6539 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
6540  // rather than doing an unlimited scan everywhere.
6541 bool SCCNotDead =
6542 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
6543 std::numeric_limits<unsigned>::max()) !=
6545 if (SCCNotDead) {
6546 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6547 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6548 .addImm(1)
6549 .addImm(0);
6550 }
6551
6552 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6553
6554 // Save the EXEC mask
6555 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6556
6557 // Killed uses in the instruction we are waterfalling around will be
6558 // incorrect due to the added control-flow.
6560 ++AfterMI;
6561 for (auto I = Begin; I != AfterMI; I++) {
6562 for (auto &MO : I->all_uses())
6563 MRI.clearKillFlags(MO.getReg());
6564 }
6565
6566 // To insert the loop we need to split the block. Move everything after this
6567 // point to a new block, and insert a new empty block between the two.
6570 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6572 ++MBBI;
6573
6574 MF.insert(MBBI, LoopBB);
6575 MF.insert(MBBI, BodyBB);
6576 MF.insert(MBBI, RemainderBB);
6577
6578 LoopBB->addSuccessor(BodyBB);
6579 BodyBB->addSuccessor(LoopBB);
6580 BodyBB->addSuccessor(RemainderBB);
6581
6582  // Move the instructions from Begin up to MI into BodyBB, and the remainder
6583  // of the block to RemainderBB.
6584 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6585 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6586 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6587
6588 MBB.addSuccessor(LoopBB);
6589
6590 // Update dominators. We know that MBB immediately dominates LoopBB, that
6591 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6592 // RemainderBB. RemainderBB immediately dominates all of the successors
6593 // transferred to it from MBB that MBB used to properly dominate.
6594 if (MDT) {
6595 MDT->addNewBlock(LoopBB, &MBB);
6596 MDT->addNewBlock(BodyBB, LoopBB);
6597 MDT->addNewBlock(RemainderBB, BodyBB);
6598 for (auto &Succ : RemainderBB->successors()) {
6599 if (MDT->properlyDominates(&MBB, Succ)) {
6600 MDT->changeImmediateDominator(Succ, RemainderBB);
6601 }
6602 }
6603 }
6604
6605 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
6606
6607 MachineBasicBlock::iterator First = RemainderBB->begin();
6608 // Restore SCC
6609 if (SCCNotDead) {
6610 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
6611 .addReg(SaveSCCReg, RegState::Kill)
6612 .addImm(0);
6613 }
6614
6615 // Restore the EXEC mask
6616 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
6617 return BodyBB;
6618}
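// The resulting control flow, sketched: the original block MBB falls through
// into LoopBB; LoopBB and BodyBB form the waterfall cycle (BodyBB branches
// back to LoopBB while unprocessed lanes remain); RemainderBB receives
// everything after the waterfalled range together with the EXEC restore and,
// when SCC was live across the loop, an S_CMP_LG_U32 against 0 that
// re-materializes SCC from the S_CSELECT_B32 result saved above.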
6619
6620// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
6621static std::tuple<unsigned, unsigned>
6623 MachineBasicBlock &MBB = *MI.getParent();
6624 MachineFunction &MF = *MBB.getParent();
6626
6627 // Extract the ptr from the resource descriptor.
6628 unsigned RsrcPtr =
6629 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
6630 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
6631
6632 // Create an empty resource descriptor
6633 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6634 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6635 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6636 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6637 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
6638
6639 // Zero64 = 0
6640 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
6641 .addImm(0);
6642
6643 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6644 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
6645 .addImm(Lo_32(RsrcDataFormat));
6646
6647 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6648 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
6649 .addImm(Hi_32(RsrcDataFormat));
6650
6651 // NewSRsrc = {Zero64, SRsrcFormat}
6652 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
6653 .addReg(Zero64)
6654 .addImm(AMDGPU::sub0_sub1)
6655 .addReg(SRsrcFormatLo)
6656 .addImm(AMDGPU::sub2)
6657 .addReg(SRsrcFormatHi)
6658 .addImm(AMDGPU::sub3);
6659
6660 return std::tuple(RsrcPtr, NewSRsrc);
6661}
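// Illustrative shape of the two values returned above: RsrcPtr holds the
// 64-bit base pointer copied out of the descriptor's sub0_sub1, and NewSRsrc
// is a 128-bit descriptor laid out as { 0, 0, RSRC_DATA_FORMAT[31:0],
// RSRC_DATA_FORMAT[63:32] } across sub0..sub3, i.e. a null base with the
// subtarget's default data format.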
6662
6665 MachineDominatorTree *MDT) const {
6666 MachineFunction &MF = *MI.getParent()->getParent();
6668 MachineBasicBlock *CreatedBB = nullptr;
6669
6670 // Legalize VOP2
6671 if (isVOP2(MI) || isVOPC(MI)) {
6673 return CreatedBB;
6674 }
6675
6676 // Legalize VOP3
6677 if (isVOP3(MI)) {
6679 return CreatedBB;
6680 }
6681
6682 // Legalize SMRD
6683 if (isSMRD(MI)) {
6685 return CreatedBB;
6686 }
6687
6688 // Legalize FLAT
6689 if (isFLAT(MI)) {
6691 return CreatedBB;
6692 }
6693
6694 // Legalize REG_SEQUENCE and PHI
6695  // The register class of the operands must be the same type as the register
6696 // class of the output.
6697 if (MI.getOpcode() == AMDGPU::PHI) {
6698 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
6699 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
6700 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
6701 continue;
6702 const TargetRegisterClass *OpRC =
6703 MRI.getRegClass(MI.getOperand(i).getReg());
6704 if (RI.hasVectorRegisters(OpRC)) {
6705 VRC = OpRC;
6706 } else {
6707 SRC = OpRC;
6708 }
6709 }
6710
6711    // If any of the operands are VGPR registers, then they all must be VGPRs;
6712    // otherwise we will create illegal VGPR->SGPR copies when legalizing
6713    // them.
6714 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
6715 if (!VRC) {
6716 assert(SRC);
6717 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
6718 VRC = &AMDGPU::VReg_1RegClass;
6719 } else
6720 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6721 ? RI.getEquivalentAGPRClass(SRC)
6722 : RI.getEquivalentVGPRClass(SRC);
6723 } else {
6724 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6725 ? RI.getEquivalentAGPRClass(VRC)
6726 : RI.getEquivalentVGPRClass(VRC);
6727 }
6728 RC = VRC;
6729 } else {
6730 RC = SRC;
6731 }
6732
6733 // Update all the operands so they have the same type.
6734 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6735 MachineOperand &Op = MI.getOperand(I);
6736 if (!Op.isReg() || !Op.getReg().isVirtual())
6737 continue;
6738
6739 // MI is a PHI instruction.
6740 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
6742
6743 // Avoid creating no-op copies with the same src and dst reg class. These
6744 // confuse some of the machine passes.
6745 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
6746 }
6747 }
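  // Illustrative example for the PHI handling above: given
  //   %r = PHI %a:sgpr_32, %bb.1, %b:vgpr_32, %bb.2
  // the single VGPR input forces the common register class to a VGPR (or AGPR)
  // class, so the SGPR input is copied to that class in its predecessor block
  // (roughly, a COPY is inserted at the end of %bb.1 before its terminators),
  // leaving every PHI operand with a matching class and avoiding illegal
  // VGPR->SGPR copies later.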
6748
6749 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6750 // VGPR dest type and SGPR sources, insert copies so all operands are
6751 // VGPRs. This seems to help operand folding / the register coalescer.
6752 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
6753 MachineBasicBlock *MBB = MI.getParent();
6754 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
6755 if (RI.hasVGPRs(DstRC)) {
6756 // Update all the operands so they are VGPR register classes. These may
6757 // not be the same register class because REG_SEQUENCE supports mixing
6758 // subregister index types e.g. sub0_sub1 + sub2 + sub3
6759 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6760 MachineOperand &Op = MI.getOperand(I);
6761 if (!Op.isReg() || !Op.getReg().isVirtual())
6762 continue;
6763
6764 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
6765 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
6766 if (VRC == OpRC)
6767 continue;
6768
6769 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
6770 Op.setIsKill();
6771 }
6772 }
6773
6774 return CreatedBB;
6775 }
6776
6777 // Legalize INSERT_SUBREG
6778 // src0 must have the same register class as dst
6779 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
6780 Register Dst = MI.getOperand(0).getReg();
6781 Register Src0 = MI.getOperand(1).getReg();
6782 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
6783 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
6784 if (DstRC != Src0RC) {
6785 MachineBasicBlock *MBB = MI.getParent();
6786 MachineOperand &Op = MI.getOperand(1);
6787 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
6788 }
6789 return CreatedBB;
6790 }
6791
6792 // Legalize SI_INIT_M0
6793 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
6794 MachineOperand &Src = MI.getOperand(0);
6795 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6796 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6797 return CreatedBB;
6798 }
6799
6800 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6801 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
6802 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
6803 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
6804 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
6805 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
6806 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
6807 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
6808 MachineOperand &Src = MI.getOperand(1);
6809 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6810 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6811 return CreatedBB;
6812 }
6813
6814 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
6815 //
6816 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
6817 // scratch memory access. In both cases, the legalization never involves
6818 // conversion to the addr64 form.
6820 (isMUBUF(MI) || isMTBUF(MI)))) {
6821 int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc
6822 : AMDGPU::OpName::srsrc;
6823 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
6824 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
6825 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
6826
6827 int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
6828 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
6829 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
6830 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
6831
6832 return CreatedBB;
6833 }
6834
6835 // Legalize SI_CALL
6836 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
6837 MachineOperand *Dest = &MI.getOperand(0);
6838 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
6839      // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, as well as
6840      // the following copies, into the loop block; we also need to move copies
6841      // from and to physical registers into it.
6842 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
6843 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
6844
6845 // Also move the copies to physical registers into the loop block
6846 MachineBasicBlock &MBB = *MI.getParent();
6848 while (Start->getOpcode() != FrameSetupOpcode)
6849 --Start;
6851 while (End->getOpcode() != FrameDestroyOpcode)
6852 ++End;
6853 // Also include following copies of the return value
6854 ++End;
6855 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
6856 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
6857 ++End;
6858 CreatedBB =
6859 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
6860 }
6861 }
6862
6863 // Legalize s_sleep_var.
6864 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
6865 const DebugLoc &DL = MI.getDebugLoc();
6866 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6867 int Src0Idx =
6868 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
6869 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6870 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6871 .add(Src0);
6872 Src0.ChangeToRegister(Reg, false);
6873 return nullptr;
6874 }
6875
6876 // Legalize MUBUF instructions.
6877 bool isSoffsetLegal = true;
6878 int SoffsetIdx =
6879 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
6880 if (SoffsetIdx != -1) {
6881 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
6882 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
6883 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
6884 isSoffsetLegal = false;
6885 }
6886 }
6887
6888 bool isRsrcLegal = true;
6889 int RsrcIdx =
6890 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
6891 if (RsrcIdx != -1) {
6892 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6893 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
6894 isRsrcLegal = false;
6895 }
6896
6897 // The operands are legal.
6898 if (isRsrcLegal && isSoffsetLegal)
6899 return CreatedBB;
6900
6901 if (!isRsrcLegal) {
6902 // Legalize a VGPR Rsrc
6903 //
6904 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
6905 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
6906 // a zero-value SRsrc.
6907 //
6908 // If the instruction is _OFFSET (both idxen and offen disabled), and we
6909 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
6910 // above.
6911 //
6912 // Otherwise we are on non-ADDR64 hardware, and/or we have
6913 // idxen/offen/bothen and we fall back to a waterfall loop.
6914
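 // Roughly, the ADDR64 path computes NewVAddr = (base pointer taken from the
 // VGPR Rsrc) + VAddr and rewrites the descriptor with a zeroed base so the
 // base is not applied twice, while the waterfall path readfirstlanes the
 // descriptor once per unique value across the wave.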
6915 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6916 MachineBasicBlock &MBB = *MI.getParent();
6917
6918 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
6919 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
6920 // This is already an ADDR64 instruction so we need to add the pointer
6921 // extracted from the resource descriptor to the current value of VAddr.
6922 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6923 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6924 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6925
6926 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
6927 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
6928 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
6929
6930 unsigned RsrcPtr, NewSRsrc;
6931 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6932
6933 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
6934 const DebugLoc &DL = MI.getDebugLoc();
6935 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
6936 .addDef(CondReg0)
6937 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6938 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
6939 .addImm(0);
6940
6941 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
6942 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
6943 .addDef(CondReg1, RegState::Dead)
6944 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6945 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
6946 .addReg(CondReg0, RegState::Kill)
6947 .addImm(0);
6948
6949 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6950 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
6951 .addReg(NewVAddrLo)
6952 .addImm(AMDGPU::sub0)
6953 .addReg(NewVAddrHi)
6954 .addImm(AMDGPU::sub1);
6955
6956 VAddr->setReg(NewVAddr);
6957 Rsrc->setReg(NewSRsrc);
6958 } else if (!VAddr && ST.hasAddr64()) {
6959 // This instruction is the _OFFSET variant, so we need to convert it to
6960 // ADDR64.
6961 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
6962 "FIXME: Need to emit flat atomics here");
6963
6964 unsigned RsrcPtr, NewSRsrc;
6965 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6966
6967 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6968 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
6969 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
6970 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6971 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
6972
6973 // Atomics with return have an additional tied operand and are
6974 // missing some of the special bits.
6975 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
6976 MachineInstr *Addr64;
6977
6978 if (!VDataIn) {
6979 // Regular buffer load / store.
6980 MachineInstrBuilder MIB =
6981 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6982 .add(*VData)
6983 .addReg(NewVAddr)
6984 .addReg(NewSRsrc)
6985 .add(*SOffset)
6986 .add(*Offset);
6987
6988 if (const MachineOperand *CPol =
6989 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
6990 MIB.addImm(CPol->getImm());
6991 }
6992
6993 if (const MachineOperand *TFE =
6994 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
6995 MIB.addImm(TFE->getImm());
6996 }
6997
6998 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
6999
7000 MIB.cloneMemRefs(MI);
7001 Addr64 = MIB;
7002 } else {
7003 // Atomics with return.
7004 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7005 .add(*VData)
7006 .add(*VDataIn)
7007 .addReg(NewVAddr)
7008 .addReg(NewSRsrc)
7009 .add(*SOffset)
7010 .add(*Offset)
7011 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7012 .cloneMemRefs(MI);
7013 }
7014
7015 MI.removeFromParent();
7016
7017 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7018 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7019 NewVAddr)
7020 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7021 .addImm(AMDGPU::sub0)
7022 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7023 .addImm(AMDGPU::sub1);
7024 } else {
7025 // Legalize a VGPR Rsrc and soffset together.
7026 if (!isSoffsetLegal) {
7027 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7028 CreatedBB =
7029 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7030 return CreatedBB;
7031 }
7032 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7033 return CreatedBB;
7034 }
7035 }
7036
7037 // Legalize a VGPR soffset.
7038 if (!isSoffsetLegal) {
7039 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7040 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7041 return CreatedBB;
7042 }
7043 return CreatedBB;
7044}
7045
7046 void SIInstrWorklist::insert(MachineInstr *MI) {
7047 InstrList.insert(MI);
7048 // Add MBUF instructions to the deferred list.
7049 int RsrcIdx =
7050 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7051 if (RsrcIdx != -1) {
7052 DeferredList.insert(MI);
7053 }
7054}
7055
7056 bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7057 return DeferredList.contains(MI);
7058}
7059
7060 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7061 MachineDominatorTree *MDT) const {
7062
7063 while (!Worklist.empty()) {
7064 MachineInstr &Inst = *Worklist.top();
7065 Worklist.erase_top();
7066 // Skip MachineInstr in the deferred list.
7067 if (Worklist.isDeferred(&Inst))
7068 continue;
7069 moveToVALUImpl(Worklist, MDT, Inst);
7070 }
7071
7072 // The deferred list of instructions will be processed once all the
7073 // MachineInstrs in the worklist are done.
7074 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7075 moveToVALUImpl(Worklist, MDT, *Inst);
7076 assert(Worklist.empty() &&
7077 "Deferred MachineInstr are not supposed to re-populate worklist");
7078 }
7079}
7080
7081 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7082 MachineDominatorTree *MDT,
7083 MachineInstr &Inst) const {
7084
7085 MachineBasicBlock *MBB = Inst.getParent();
7086 if (!MBB)
7087 return;
7088 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7089 unsigned Opcode = Inst.getOpcode();
7090 unsigned NewOpcode = getVALUOp(Inst);
7091 // Handle some special cases
7092 switch (Opcode) {
7093 default:
7094 break;
7095 case AMDGPU::S_ADD_U64_PSEUDO:
7096 NewOpcode = AMDGPU::V_ADD_U64_PSEUDO;
7097 break;
7098 case AMDGPU::S_SUB_U64_PSEUDO:
7099 NewOpcode = AMDGPU::V_SUB_U64_PSEUDO;
7100 break;
7101 case AMDGPU::S_ADD_I32:
7102 case AMDGPU::S_SUB_I32: {
7103 // FIXME: The u32 versions currently selected use the carry.
7104 bool Changed;
7105 MachineBasicBlock *CreatedBBTmp = nullptr;
7106 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7107 if (Changed)
7108 return;
7109
7110 // Default handling
7111 break;
7112 }
7113
7114 case AMDGPU::S_MUL_U64:
7115 // Split s_mul_u64 into 32-bit vector multiplications.
7116 splitScalarSMulU64(Worklist, Inst, MDT);
7117 Inst.eraseFromParent();
7118 return;
7119
7120 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7121 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7122 // This is a special case of s_mul_u64 where all the operands are either
7123 // zero extended or sign extended.
7124 splitScalarSMulPseudo(Worklist, Inst, MDT);
7125 Inst.eraseFromParent();
7126 return;
7127
7128 case AMDGPU::S_AND_B64:
7129 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7130 Inst.eraseFromParent();
7131 return;
7132
7133 case AMDGPU::S_OR_B64:
7134 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7135 Inst.eraseFromParent();
7136 return;
7137
7138 case AMDGPU::S_XOR_B64:
7139 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7140 Inst.eraseFromParent();
7141 return;
7142
7143 case AMDGPU::S_NAND_B64:
7144 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7145 Inst.eraseFromParent();
7146 return;
7147
7148 case AMDGPU::S_NOR_B64:
7149 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7150 Inst.eraseFromParent();
7151 return;
7152
7153 case AMDGPU::S_XNOR_B64:
7154 if (ST.hasDLInsts())
7155 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7156 else
7157 splitScalar64BitXnor(Worklist, Inst, MDT);
7158 Inst.eraseFromParent();
7159 return;
7160
7161 case AMDGPU::S_ANDN2_B64:
7162 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7163 Inst.eraseFromParent();
7164 return;
7165
7166 case AMDGPU::S_ORN2_B64:
7167 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7168 Inst.eraseFromParent();
7169 return;
7170
7171 case AMDGPU::S_BREV_B64:
7172 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7173 Inst.eraseFromParent();
7174 return;
7175
7176 case AMDGPU::S_NOT_B64:
7177 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7178 Inst.eraseFromParent();
7179 return;
7180
7181 case AMDGPU::S_BCNT1_I32_B64:
7182 splitScalar64BitBCNT(Worklist, Inst);
7183 Inst.eraseFromParent();
7184 return;
7185
7186 case AMDGPU::S_BFE_I64:
7187 splitScalar64BitBFE(Worklist, Inst);
7188 Inst.eraseFromParent();
7189 return;
7190
7191 case AMDGPU::S_FLBIT_I32_B64:
7192 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7193 Inst.eraseFromParent();
7194 return;
7195 case AMDGPU::S_FF1_I32_B64:
7196 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7197 Inst.eraseFromParent();
7198 return;
7199
7200 case AMDGPU::S_LSHL_B32:
7201 if (ST.hasOnlyRevVALUShifts()) {
7202 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7203 swapOperands(Inst);
7204 }
7205 break;
7206 case AMDGPU::S_ASHR_I32:
7207 if (ST.hasOnlyRevVALUShifts()) {
7208 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7209 swapOperands(Inst);
7210 }
7211 break;
7212 case AMDGPU::S_LSHR_B32:
7213 if (ST.hasOnlyRevVALUShifts()) {
7214 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7215 swapOperands(Inst);
7216 }
7217 break;
7218 case AMDGPU::S_LSHL_B64:
7219 if (ST.hasOnlyRevVALUShifts()) {
7220 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7221 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7222 : AMDGPU::V_LSHLREV_B64_e64;
7223 swapOperands(Inst);
7224 }
7225 break;
7226 case AMDGPU::S_ASHR_I64:
7227 if (ST.hasOnlyRevVALUShifts()) {
7228 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7229 swapOperands(Inst);
7230 }
7231 break;
7232 case AMDGPU::S_LSHR_B64:
7233 if (ST.hasOnlyRevVALUShifts()) {
7234 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7235 swapOperands(Inst);
7236 }
7237 break;
7238
7239 case AMDGPU::S_ABS_I32:
7240 lowerScalarAbs(Worklist, Inst);
7241 Inst.eraseFromParent();
7242 return;
7243
7244 case AMDGPU::S_CBRANCH_SCC0:
7245 case AMDGPU::S_CBRANCH_SCC1: {
7246 // Clear unused bits of vcc
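 // (The rebuilt condition is vcc = exec & cond, so stale bits from inactive
 // lanes cannot influence the vector branch.)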
7247 Register CondReg = Inst.getOperand(1).getReg();
7248 bool IsSCC = CondReg == AMDGPU::SCC;
7249 Register VCC = RI.getVCC();
7250 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7251 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7252 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7253 .addReg(EXEC)
7254 .addReg(IsSCC ? VCC : CondReg);
7255 Inst.removeOperand(1);
7256 } break;
7257
7258 case AMDGPU::S_BFE_U64:
7259 case AMDGPU::S_BFM_B64:
7260 llvm_unreachable("Moving this op to VALU not implemented");
7261
7262 case AMDGPU::S_PACK_LL_B32_B16:
7263 case AMDGPU::S_PACK_LH_B32_B16:
7264 case AMDGPU::S_PACK_HL_B32_B16:
7265 case AMDGPU::S_PACK_HH_B32_B16:
7266 movePackToVALU(Worklist, MRI, Inst);
7267 Inst.eraseFromParent();
7268 return;
7269
7270 case AMDGPU::S_XNOR_B32:
7271 lowerScalarXnor(Worklist, Inst);
7272 Inst.eraseFromParent();
7273 return;
7274
7275 case AMDGPU::S_NAND_B32:
7276 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7277 Inst.eraseFromParent();
7278 return;
7279
7280 case AMDGPU::S_NOR_B32:
7281 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7282 Inst.eraseFromParent();
7283 return;
7284
7285 case AMDGPU::S_ANDN2_B32:
7286 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7287 Inst.eraseFromParent();
7288 return;
7289
7290 case AMDGPU::S_ORN2_B32:
7291 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7292 Inst.eraseFromParent();
7293 return;
7294
7295 // TODO: remove as soon as everything is ready
7296 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7297 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7298 // can only be selected from the uniform SDNode.
7299 case AMDGPU::S_ADD_CO_PSEUDO:
7300 case AMDGPU::S_SUB_CO_PSEUDO: {
7301 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7302 ? AMDGPU::V_ADDC_U32_e64
7303 : AMDGPU::V_SUBB_U32_e64;
7304 const auto *CarryRC = RI.getWaveMaskRegClass();
7305
7306 Register CarryInReg = Inst.getOperand(4).getReg();
7307 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7308 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7309 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7310 .addReg(CarryInReg);
7311 }
7312
7313 Register CarryOutReg = Inst.getOperand(1).getReg();
7314
7315 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7316 MRI.getRegClass(Inst.getOperand(0).getReg())));
7317 MachineInstr *CarryOp =
7318 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7319 .addReg(CarryOutReg, RegState::Define)
7320 .add(Inst.getOperand(2))
7321 .add(Inst.getOperand(3))
7322 .addReg(CarryInReg)
7323 .addImm(0);
7324 legalizeOperands(*CarryOp);
7325 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7326 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7327 Inst.eraseFromParent();
7328 }
7329 return;
7330 case AMDGPU::S_UADDO_PSEUDO:
7331 case AMDGPU::S_USUBO_PSEUDO: {
7332 const DebugLoc &DL = Inst.getDebugLoc();
7333 MachineOperand &Dest0 = Inst.getOperand(0);
7334 MachineOperand &Dest1 = Inst.getOperand(1);
7335 MachineOperand &Src0 = Inst.getOperand(2);
7336 MachineOperand &Src1 = Inst.getOperand(3);
7337
7338 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7339 ? AMDGPU::V_ADD_CO_U32_e64
7340 : AMDGPU::V_SUB_CO_U32_e64;
7341 const TargetRegisterClass *NewRC =
7342 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7343 Register DestReg = MRI.createVirtualRegister(NewRC);
7344 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7345 .addReg(Dest1.getReg(), RegState::Define)
7346 .add(Src0)
7347 .add(Src1)
7348 .addImm(0); // clamp bit
7349
7350 legalizeOperands(*NewInstr, MDT);
7351 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7352 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7353 Worklist);
7354 Inst.eraseFromParent();
7355 }
7356 return;
7357
7358 case AMDGPU::S_CSELECT_B32:
7359 case AMDGPU::S_CSELECT_B64:
7360 lowerSelect(Worklist, Inst, MDT);
7361 Inst.eraseFromParent();
7362 return;
7363 case AMDGPU::S_CMP_EQ_I32:
7364 case AMDGPU::S_CMP_LG_I32:
7365 case AMDGPU::S_CMP_GT_I32:
7366 case AMDGPU::S_CMP_GE_I32:
7367 case AMDGPU::S_CMP_LT_I32:
7368 case AMDGPU::S_CMP_LE_I32:
7369 case AMDGPU::S_CMP_EQ_U32:
7370 case AMDGPU::S_CMP_LG_U32:
7371 case AMDGPU::S_CMP_GT_U32:
7372 case AMDGPU::S_CMP_GE_U32:
7373 case AMDGPU::S_CMP_LT_U32:
7374 case AMDGPU::S_CMP_LE_U32:
7375 case AMDGPU::S_CMP_EQ_U64:
7376 case AMDGPU::S_CMP_LG_U64:
7377 case AMDGPU::S_CMP_LT_F32:
7378 case AMDGPU::S_CMP_EQ_F32:
7379 case AMDGPU::S_CMP_LE_F32:
7380 case AMDGPU::S_CMP_GT_F32:
7381 case AMDGPU::S_CMP_LG_F32:
7382 case AMDGPU::S_CMP_GE_F32:
7383 case AMDGPU::S_CMP_O_F32:
7384 case AMDGPU::S_CMP_U_F32:
7385 case AMDGPU::S_CMP_NGE_F32:
7386 case AMDGPU::S_CMP_NLG_F32:
7387 case AMDGPU::S_CMP_NGT_F32:
7388 case AMDGPU::S_CMP_NLE_F32:
7389 case AMDGPU::S_CMP_NEQ_F32:
7390 case AMDGPU::S_CMP_NLT_F32: {
7391 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7392 auto NewInstr =
7393 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7394 .setMIFlags(Inst.getFlags());
7395 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7396 0) {
7397 NewInstr
7398 .addImm(0) // src0_modifiers
7399 .add(Inst.getOperand(0)) // src0
7400 .addImm(0) // src1_modifiers
7401 .add(Inst.getOperand(1)) // src1
7402 .addImm(0); // clamp
7403 } else {
7404 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7405 }
7406 legalizeOperands(*NewInstr, MDT);
7407 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7408 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7409 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7410 Inst.eraseFromParent();
7411 return;
7412 }
7413 case AMDGPU::S_CMP_LT_F16:
7414 case AMDGPU::S_CMP_EQ_F16:
7415 case AMDGPU::S_CMP_LE_F16:
7416 case AMDGPU::S_CMP_GT_F16:
7417 case AMDGPU::S_CMP_LG_F16:
7418 case AMDGPU::S_CMP_GE_F16:
7419 case AMDGPU::S_CMP_O_F16:
7420 case AMDGPU::S_CMP_U_F16:
7421 case AMDGPU::S_CMP_NGE_F16:
7422 case AMDGPU::S_CMP_NLG_F16:
7423 case AMDGPU::S_CMP_NGT_F16:
7424 case AMDGPU::S_CMP_NLE_F16:
7425 case AMDGPU::S_CMP_NEQ_F16:
7426 case AMDGPU::S_CMP_NLT_F16: {
7427 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7428 auto NewInstr =
7429 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7430 .setMIFlags(Inst.getFlags());
7431 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
7432 NewInstr
7433 .addImm(0) // src0_modifiers
7434 .add(Inst.getOperand(0)) // src0
7435 .addImm(0) // src1_modifiers
7436 .add(Inst.getOperand(1)) // src1
7437 .addImm(0); // clamp
7438 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7439 NewInstr.addImm(0); // op_sel0
7440 } else {
7441 NewInstr
7442 .add(Inst.getOperand(0))
7443 .add(Inst.getOperand(1));
7444 }
7445 legalizeOperands(*NewInstr, MDT);
7446 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7447 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7448 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7449 Inst.eraseFromParent();
7450 return;
7451 }
7452 case AMDGPU::S_CVT_HI_F32_F16: {
7453 const DebugLoc &DL = Inst.getDebugLoc();
7454 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7455 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7456 if (ST.useRealTrue16Insts()) {
7457 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
7458 .add(Inst.getOperand(1));
7459 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7460 .addImm(0) // src0_modifiers
7461 .addReg(TmpReg, 0, AMDGPU::hi16)
7462 .addImm(0) // clamp
7463 .addImm(0) // omod
7464 .addImm(0); // op_sel0
7465 } else {
7466 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7467 .addImm(16)
7468 .add(Inst.getOperand(1));
7469 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7470 .addImm(0) // src0_modifiers
7471 .addReg(TmpReg)
7472 .addImm(0) // clamp
7473 .addImm(0); // omod
7474 }
7475
7476 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7477 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7478 Inst.eraseFromParent();
7479 return;
7480 }
7481 case AMDGPU::S_MINIMUM_F32:
7482 case AMDGPU::S_MAXIMUM_F32:
7483 case AMDGPU::S_MINIMUM_F16:
7484 case AMDGPU::S_MAXIMUM_F16: {
7485 const DebugLoc &DL = Inst.getDebugLoc();
7486 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7487 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7488 .addImm(0) // src0_modifiers
7489 .add(Inst.getOperand(1))
7490 .addImm(0) // src1_modifiers
7491 .add(Inst.getOperand(2))
7492 .addImm(0) // clamp
7493 .addImm(0); // omod
7494 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7495
7496 legalizeOperands(*NewInstr, MDT);
7497 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7498 Inst.eraseFromParent();
7499 return;
7500 }
7501 }
7502
7503 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
7504 // We cannot move this instruction to the VALU, so we should try to
7505 // legalize its operands instead.
7506 legalizeOperands(Inst, MDT);
7507 return;
7508 }
7509 // Handle converting generic instructions like COPY-to-SGPR into
7510 // COPY-to-VGPR.
7511 if (NewOpcode == Opcode) {
7512 Register DstReg = Inst.getOperand(0).getReg();
7513 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7514
7515 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7516 // hope for the best.
7517 if (Inst.isCopy() && DstReg.isPhysical() &&
7518 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7519 // TODO: Only works for 32 bit registers.
7520 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7521 get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7522 .add(Inst.getOperand(1));
7523 Inst.eraseFromParent();
7524 return;
7525 }
7526
7527 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
7528 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
7529 // Instead of creating a copy where src and dst are the same register
7530 // class, we just replace all uses of dst with src. These kinds of
7531 // copies interfere with the heuristics MachineSink uses to decide
7532 // whether or not to split a critical edge, since the pass assumes
7533 // that copies will end up as machine instructions and not be
7534 // eliminated.
7535 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
7536 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
7537 MRI.clearKillFlags(Inst.getOperand(1).getReg());
7538 Inst.getOperand(0).setReg(DstReg);
7539 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7540 // these are deleted later, but at -O0 it would leave a suspicious
7541 // looking illegal copy of an undef register.
7542 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7543 Inst.removeOperand(I);
7544 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
7545 return;
7546 }
7547 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7548 MRI.replaceRegWith(DstReg, NewDstReg);
7549 legalizeOperands(Inst, MDT);
7550 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7551 return;
7552 }
7553
7554 // Use the new VALU Opcode.
7555 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
7556 .setMIFlags(Inst.getFlags());
7557 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
7558 // Intersperse VOP3 modifiers among the SALU operands.
7559 NewInstr->addOperand(Inst.getOperand(0));
7560 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7561 AMDGPU::OpName::src0_modifiers) >= 0)
7562 NewInstr.addImm(0);
7563 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7564 MachineOperand Src = Inst.getOperand(1);
7565 if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7566 Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7567 NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7568 else
7569 NewInstr->addOperand(Src);
7570 }
7571
7572 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
7573 // We are converting these to a BFE, so we need to add the missing
7574 // operands for the size and offset.
7575 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
7576 NewInstr.addImm(0);
7577 NewInstr.addImm(Size);
7578 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
7579 // The VALU version adds the second operand to the result, so insert an
7580 // extra 0 operand.
7581 NewInstr.addImm(0);
7582 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
7583 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
7584 // If we need to move this to VGPRs, we need to unpack the second
7585 // operand back into the 2 separate ones for bit offset and width.
7586 assert(OffsetWidthOp.isImm() &&
7587 "Scalar BFE is only implemented for constant width and offset");
7588 uint32_t Imm = OffsetWidthOp.getImm();
7589
7590 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
7591 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
7592 NewInstr.addImm(Offset);
7593 NewInstr.addImm(BitWidth);
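 // For example, an immediate of 0x100008 unpacks to Offset = 8 (bits [5:0])
 // and BitWidth = 16 (bits [22:16]).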
7594 } else {
7595 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7596 AMDGPU::OpName::src1_modifiers) >= 0)
7597 NewInstr.addImm(0);
7598 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
7599 NewInstr->addOperand(Inst.getOperand(2));
7600 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7601 AMDGPU::OpName::src2_modifiers) >= 0)
7602 NewInstr.addImm(0);
7603 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
7604 NewInstr->addOperand(Inst.getOperand(3));
7605 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
7606 NewInstr.addImm(0);
7607 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
7608 NewInstr.addImm(0);
7609 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
7610 NewInstr.addImm(0);
7611 }
7612 } else {
7613 // Just copy the SALU operands.
7614 for (const MachineOperand &Op : Inst.explicit_operands())
7615 NewInstr->addOperand(Op);
7616 }
7617
7618 // Remove any references to SCC. Vector instructions can't read from it, and
7619 // we're just about to add the implicit use / defs of VCC, and we don't want
7620 // both.
7621 for (MachineOperand &Op : Inst.implicit_operands()) {
7622 if (Op.getReg() == AMDGPU::SCC) {
7623 // Only propagate through live-def of SCC.
7624 if (Op.isDef() && !Op.isDead())
7625 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
7626 if (Op.isUse())
7627 addSCCDefsToVALUWorklist(NewInstr, Worklist);
7628 }
7629 }
7630 Inst.eraseFromParent();
7631 Register NewDstReg;
7632 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
7633 Register DstReg = NewInstr->getOperand(0).getReg();
7634 assert(DstReg.isVirtual());
7635 // Update the destination register class.
7636 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
7637 assert(NewDstRC);
7638 NewDstReg = MRI.createVirtualRegister(NewDstRC);
7639 MRI.replaceRegWith(DstReg, NewDstReg);
7640 }
7641 fixImplicitOperands(*NewInstr);
7642 // Legalize the operands
7643 legalizeOperands(*NewInstr, MDT);
7644 if (NewDstReg)
7645 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7646}
7647
7648// Add/sub require special handling to deal with carry outs.
7649std::pair<bool, MachineBasicBlock *>
7650SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
7651 MachineDominatorTree *MDT) const {
7652 if (ST.hasAddNoCarry()) {
7653 // Assume there is no user of scc since we don't select this in that case.
7654 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
7655 // is used.
7656
7657 MachineBasicBlock &MBB = *Inst.getParent();
7658 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7659
7660 Register OldDstReg = Inst.getOperand(0).getReg();
7661 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7662
7663 unsigned Opc = Inst.getOpcode();
7664 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
7665
7666 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
7667 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
7668
7669 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
7670 Inst.removeOperand(3);
7671
7672 Inst.setDesc(get(NewOpc));
7673 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
7675 MRI.replaceRegWith(OldDstReg, ResultReg);
7676 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
7677
7678 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7679 return std::pair(true, NewBB);
7680 }
7681
7682 return std::pair(false, nullptr);
7683}
7684
7685void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
7686 MachineDominatorTree *MDT) const {
7687
7688 MachineBasicBlock &MBB = *Inst.getParent();
7689 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7690 MachineBasicBlock::iterator MII = Inst;
7691 DebugLoc DL = Inst.getDebugLoc();
7692
7693 MachineOperand &Dest = Inst.getOperand(0);
7694 MachineOperand &Src0 = Inst.getOperand(1);
7695 MachineOperand &Src1 = Inst.getOperand(2);
7696 MachineOperand &Cond = Inst.getOperand(3);
7697
7698 Register CondReg = Cond.getReg();
7699 bool IsSCC = (CondReg == AMDGPU::SCC);
7700
7701 // If this is a trivial select where the condition is effectively not SCC
7702 // (CondReg is a source of copy to SCC), then the select is semantically
7703 // equivalent to copying CondReg. Hence, there is no need to create a
7704 // V_CNDMASK; we can just use CondReg and bail out.
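 // For illustration (hypothetical virtual registers): S_CSELECT_B32 %dst, -1, 0
 // with the condition copied from %cond into SCC just reproduces the lane mask
 // already held in %cond, so %dst can be replaced by %cond directly.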
7705 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
7706 (Src1.getImm() == 0)) {
7707 MRI.replaceRegWith(Dest.getReg(), CondReg);
7708 return;
7709 }
7710
7711 Register NewCondReg = CondReg;
7712 if (IsSCC) {
7713 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
7714 NewCondReg = MRI.createVirtualRegister(TC);
7715
7716 // Now look for the closest SCC def if it is a copy
7717 // replacing the CondReg with the COPY source register
7718 bool CopyFound = false;
7719 for (MachineInstr &CandI :
7720 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
7721 Inst.getParent()->rend())) {
7722 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
7723 -1) {
7724 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
7725 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
7726 .addReg(CandI.getOperand(1).getReg());
7727 CopyFound = true;
7728 }
7729 break;
7730 }
7731 }
7732 if (!CopyFound) {
7733 // SCC def is not a copy
7734 // Insert a trivial select instead of creating a copy, because a copy from
7735 // SCC would semantically mean just copying a single bit, but we may need
7736 // the result to be a vector condition mask that needs preserving.
7737 unsigned Opcode =
7738 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
7739 auto NewSelect =
7740 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
7741 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
7742 }
7743 }
7744
7745 Register NewDestReg = MRI.createVirtualRegister(
7746 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
7747 MachineInstr *NewInst;
7748 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
7749 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
7750 .addImm(0)
7751 .add(Src1) // False
7752 .addImm(0)
7753 .add(Src0) // True
7754 .addReg(NewCondReg);
7755 } else {
7756 NewInst =
7757 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
7758 .add(Src1) // False
7759 .add(Src0) // True
7760 .addReg(NewCondReg);
7761 }
7762 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
7763 legalizeOperands(*NewInst, MDT);
7764 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
7765}
7766
7767void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
7768 MachineInstr &Inst) const {
7769 MachineBasicBlock &MBB = *Inst.getParent();
7770 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7771 MachineBasicBlock::iterator MII = Inst;
7772 DebugLoc DL = Inst.getDebugLoc();
7773
7774 MachineOperand &Dest = Inst.getOperand(0);
7775 MachineOperand &Src = Inst.getOperand(1);
7776 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7777 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7778
7779 unsigned SubOp = ST.hasAddNoCarry() ?
7780 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
7781
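 // In effect this lowers s_abs_i32 as max(x, 0 - x); e.g. for x = -5 the
 // result is max(-5, 5) = 5.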
7782 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
7783 .addImm(0)
7784 .addReg(Src.getReg());
7785
7786 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
7787 .addReg(Src.getReg())
7788 .addReg(TmpReg);
7789
7790 MRI.replaceRegWith(Dest.getReg(), ResultReg);
7791 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7792}
7793
7794void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
7795 MachineInstr &Inst) const {
7796 MachineBasicBlock &MBB = *Inst.getParent();
7797 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7798 MachineBasicBlock::iterator MII = Inst;
7799 const DebugLoc &DL = Inst.getDebugLoc();
7800
7801 MachineOperand &Dest = Inst.getOperand(0);
7802 MachineOperand &Src0 = Inst.getOperand(1);
7803 MachineOperand &Src1 = Inst.getOperand(2);
7804
7805 if (ST.hasDLInsts()) {
7806 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7807 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
7808 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
7809
7810 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
7811 .add(Src0)
7812 .add(Src1);
7813
7814 MRI.replaceRegWith(Dest.getReg(), NewDest);
7815 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7816 } else {
7817 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
7818 // invert either source and then perform the XOR. If either source is a
7819 // scalar register, then we can leave the inversion on the scalar unit to
7820 // achieve a better distribution of scalar and vector instructions.
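 // For example, with x = 0b1100 and y = 0b1010 (low bits shown), x ^ y =
 // 0b0110, so xnor(x, y) ends in 0b1001 with all higher bits set; (!x) ^ y
 // and x ^ (!y) both produce that same value.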
7821 bool Src0IsSGPR = Src0.isReg() &&
7822 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
7823 bool Src1IsSGPR = Src1.isReg() &&
7824 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
7825 MachineInstr *Xor;
7826 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7827 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7828
7829 // Build a pair of scalar instructions and add them to the work list.
7830 // The next iteration over the work list will lower these to the vector
7831 // unit as necessary.
7832 if (Src0IsSGPR) {
7833 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
7834 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7835 .addReg(Temp)
7836 .add(Src1);
7837 } else if (Src1IsSGPR) {
7838 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
7839 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7840 .add(Src0)
7841 .addReg(Temp);
7842 } else {
7843 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
7844 .add(Src0)
7845 .add(Src1);
7846 MachineInstr *Not =
7847 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
7848 Worklist.insert(Not);
7849 }
7850
7851 MRI.replaceRegWith(Dest.getReg(), NewDest);
7852
7853 Worklist.insert(Xor);
7854
7855 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7856 }
7857}
7858
7859void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
7860 MachineInstr &Inst,
7861 unsigned Opcode) const {
7862 MachineBasicBlock &MBB = *Inst.getParent();
7863 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7864 MachineBasicBlock::iterator MII = Inst;
7865 const DebugLoc &DL = Inst.getDebugLoc();
7866
7867 MachineOperand &Dest = Inst.getOperand(0);
7868 MachineOperand &Src0 = Inst.getOperand(1);
7869 MachineOperand &Src1 = Inst.getOperand(2);
7870
7871 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7872 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7873
7874 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
7875 .add(Src0)
7876 .add(Src1);
7877
7878 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
7879 .addReg(Interm);
7880
7881 Worklist.insert(&Op);
7882 Worklist.insert(&Not);
7883
7884 MRI.replaceRegWith(Dest.getReg(), NewDest);
7885 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7886}
7887
7888void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
7889 MachineInstr &Inst,
7890 unsigned Opcode) const {
7891 MachineBasicBlock &MBB = *Inst.getParent();
7892 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7893 MachineBasicBlock::iterator MII = Inst;
7894 const DebugLoc &DL = Inst.getDebugLoc();
7895
7896 MachineOperand &Dest = Inst.getOperand(0);
7897 MachineOperand &Src0 = Inst.getOperand(1);
7898 MachineOperand &Src1 = Inst.getOperand(2);
7899
7900 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7901 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7902
7903 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
7904 .add(Src1);
7905
7906 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
7907 .add(Src0)
7908 .addReg(Interm);
7909
7910 Worklist.insert(&Not);
7911 Worklist.insert(&Op);
7912
7913 MRI.replaceRegWith(Dest.getReg(), NewDest);
7914 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7915}
7916
7917void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
7918 MachineInstr &Inst, unsigned Opcode,
7919 bool Swap) const {
7920 MachineBasicBlock &MBB = *Inst.getParent();
7921 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7922
7923 MachineOperand &Dest = Inst.getOperand(0);
7924 MachineOperand &Src0 = Inst.getOperand(1);
7925 DebugLoc DL = Inst.getDebugLoc();
7926
7927 MachineBasicBlock::iterator MII = Inst;
7928
7929 const MCInstrDesc &InstDesc = get(Opcode);
7930 const TargetRegisterClass *Src0RC = Src0.isReg() ?
7931 MRI.getRegClass(Src0.getReg()) :
7932 &AMDGPU::SGPR_32RegClass;
7933
7934 const TargetRegisterClass *Src0SubRC =
7935 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7936
7937 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7938 AMDGPU::sub0, Src0SubRC);
7939
7940 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
7941 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
7942 const TargetRegisterClass *NewDestSubRC =
7943 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
7944
7945 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
7946 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
7947
7948 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7949 AMDGPU::sub1, Src0SubRC);
7950
7951 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
7952 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
7953
7954 if (Swap)
7955 std::swap(DestSub0, DestSub1);
7956
7957 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
7958 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7959 .addReg(DestSub0)
7960 .addImm(AMDGPU::sub0)
7961 .addReg(DestSub1)
7962 .addImm(AMDGPU::sub1);
7963
7964 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7965
7966 Worklist.insert(&LoHalf);
7967 Worklist.insert(&HiHalf);
7968
7969 // We don't need to legalizeOperands here because for a single operand, src0
7970 // will support any kind of input.
7971
7972 // Move all users of this moved value.
7973 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7974}
7975
7976// There is not a vector equivalent of s_mul_u64. For this reason, we need to
7977// split the s_mul_u64 in 32-bit vector multiplications.
7978void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
7979 MachineInstr &Inst,
7980 MachineDominatorTree *MDT) const {
7981 MachineBasicBlock &MBB = *Inst.getParent();
7982 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7983
7984 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7985 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7986 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7987
7988 MachineOperand &Dest = Inst.getOperand(0);
7989 MachineOperand &Src0 = Inst.getOperand(1);
7990 MachineOperand &Src1 = Inst.getOperand(2);
7991 const DebugLoc &DL = Inst.getDebugLoc();
7992 MachineBasicBlock::iterator MII = Inst;
7993
7994 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7995 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7996 const TargetRegisterClass *Src0SubRC =
7997 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7998 if (RI.isSGPRClass(Src0SubRC))
7999 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8000 const TargetRegisterClass *Src1SubRC =
8001 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8002 if (RI.isSGPRClass(Src1SubRC))
8003 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8004
8005 // First, we extract the low 32-bit and high 32-bit values from each of the
8006 // operands.
8007 MachineOperand Op0L =
8008 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8009 MachineOperand Op1L =
8010 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8011 MachineOperand Op0H =
8012 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8013 MachineOperand Op1H =
8014 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8015
8016 // The multiplication is done as follows:
8017 //
8018 // Op1H Op1L
8019 // * Op0H Op0L
8020 // --------------------
8021 // Op1H*Op0L Op1L*Op0L
8022 // + Op1H*Op0H Op1L*Op0H
8023 // -----------------------------------------
8024 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8025 //
8026 // We drop Op1H*Op0H because its contribution starts at bit 64 and therefore
8027 // falls outside the 64-bit result.
8028 // The low 32-bit value is Op1L*Op0L.
8029 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
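 // For illustration (hypothetical values): with Op0 = 0x100000005 (Op0H = 1,
 // Op0L = 5) and Op1 = 0x300000007 (Op1H = 3, Op1L = 7), the low half is
 // Op1L*Op0L = 35 and the high half is Op1H*Op0L + Op1L*Op0H + carry =
 // 15 + 7 + 0 = 22, i.e. the result is 0x1600000023, which matches the low
 // 64 bits of the full product.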
8030
8031 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8032 MachineInstr *Op1L_Op0H =
8033 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8034 .add(Op1L)
8035 .add(Op0H);
8036
8037 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8038 MachineInstr *Op1H_Op0L =
8039 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8040 .add(Op1H)
8041 .add(Op0L);
8042
8043 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8044 MachineInstr *Carry =
8045 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8046 .add(Op1L)
8047 .add(Op0L);
8048
8049 MachineInstr *LoHalf =
8050 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8051 .add(Op1L)
8052 .add(Op0L);
8053
8054 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8055 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8056 .addReg(Op1L_Op0H_Reg)
8057 .addReg(Op1H_Op0L_Reg);
8058
8059 MachineInstr *HiHalf =
8060 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8061 .addReg(AddReg)
8062 .addReg(CarryReg);
8063
8064 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8065 .addReg(DestSub0)
8066 .addImm(AMDGPU::sub0)
8067 .addReg(DestSub1)
8068 .addImm(AMDGPU::sub1);
8069
8070 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8071
8072 // Try to legalize the operands in case we need to swap the order to keep it
8073 // valid.
8074 legalizeOperands(*Op1L_Op0H, MDT);
8075 legalizeOperands(*Op1H_Op0L, MDT);
8076 legalizeOperands(*Carry, MDT);
8077 legalizeOperands(*LoHalf, MDT);
8078 legalizeOperands(*Add, MDT);
8079 legalizeOperands(*HiHalf, MDT);
8080
8081 // Move all users of this moved value.
8082 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8083}
8084
8085 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8086 // multiplications.
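 // In effect, because both operands are zero- or sign-extensions of 32-bit
 // values, the full 64-bit product is just {mul_hi(Op1L, Op0L),
 // mul_lo(Op1L, Op0L)}; no cross terms are needed.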
8087void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8088 MachineInstr &Inst,
8089 MachineDominatorTree *MDT) const {
8090 MachineBasicBlock &MBB = *Inst.getParent();
8091 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8092
8093 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8094 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8095 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8096
8097 MachineOperand &Dest = Inst.getOperand(0);
8098 MachineOperand &Src0 = Inst.getOperand(1);
8099 MachineOperand &Src1 = Inst.getOperand(2);
8100 const DebugLoc &DL = Inst.getDebugLoc();
8101 MachineBasicBlock::iterator MII = Inst;
8102
8103 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8104 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8105 const TargetRegisterClass *Src0SubRC =
8106 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8107 if (RI.isSGPRClass(Src0SubRC))
8108 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8109 const TargetRegisterClass *Src1SubRC =
8110 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8111 if (RI.isSGPRClass(Src1SubRC))
8112 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8113
8114 // First, we extract the low 32-bit and high 32-bit values from each of the
8115 // operands.
8116 MachineOperand Op0L =
8117 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8118 MachineOperand Op1L =
8119 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8120
8121 unsigned Opc = Inst.getOpcode();
8122 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8123 ? AMDGPU::V_MUL_HI_U32_e64
8124 : AMDGPU::V_MUL_HI_I32_e64;
8125 MachineInstr *HiHalf =
8126 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8127
8128 MachineInstr *LoHalf =
8129 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8130 .add(Op1L)
8131 .add(Op0L);
8132
8133 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8134 .addReg(DestSub0)
8135 .addImm(AMDGPU::sub0)
8136 .addReg(DestSub1)
8137 .addImm(AMDGPU::sub1);
8138
8139 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8140
8141 // Try to legalize the operands in case we need to swap the order to keep it
8142 // valid.
8143 legalizeOperands(*HiHalf, MDT);
8144 legalizeOperands(*LoHalf, MDT);
8145
8146 // Move all users of this moved value.
8147 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8148}
8149
8150void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8151 MachineInstr &Inst, unsigned Opcode,
8152 MachineDominatorTree *MDT) const {
8153 MachineBasicBlock &MBB = *Inst.getParent();
8154 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8155
8156 MachineOperand &Dest = Inst.getOperand(0);
8157 MachineOperand &Src0 = Inst.getOperand(1);
8158 MachineOperand &Src1 = Inst.getOperand(2);
8159 DebugLoc DL = Inst.getDebugLoc();
8160
8161 MachineBasicBlock::iterator MII = Inst;
8162
8163 const MCInstrDesc &InstDesc = get(Opcode);
8164 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8165 MRI.getRegClass(Src0.getReg()) :
8166 &AMDGPU::SGPR_32RegClass;
8167
8168 const TargetRegisterClass *Src0SubRC =
8169 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8170 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8171 MRI.getRegClass(Src1.getReg()) :
8172 &AMDGPU::SGPR_32RegClass;
8173
8174 const TargetRegisterClass *Src1SubRC =
8175 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8176
8177 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8178 AMDGPU::sub0, Src0SubRC);
8179 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8180 AMDGPU::sub0, Src1SubRC);
8181 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8182 AMDGPU::sub1, Src0SubRC);
8183 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8184 AMDGPU::sub1, Src1SubRC);
8185
8186 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8187 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8188 const TargetRegisterClass *NewDestSubRC =
8189 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8190
8191 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8192 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8193 .add(SrcReg0Sub0)
8194 .add(SrcReg1Sub0);
8195
8196 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8197 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8198 .add(SrcReg0Sub1)
8199 .add(SrcReg1Sub1);
8200
8201 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8202 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8203 .addReg(DestSub0)
8204 .addImm(AMDGPU::sub0)
8205 .addReg(DestSub1)
8206 .addImm(AMDGPU::sub1);
8207
8208 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8209
8210 Worklist.insert(&LoHalf);
8211 Worklist.insert(&HiHalf);
8212
8213 // Move all users of this moved value.
8214 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8215}
8216
8217void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8218 MachineInstr &Inst,
8219 MachineDominatorTree *MDT) const {
8220 MachineBasicBlock &MBB = *Inst.getParent();
8221 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8222
8223 MachineOperand &Dest = Inst.getOperand(0);
8224 MachineOperand &Src0 = Inst.getOperand(1);
8225 MachineOperand &Src1 = Inst.getOperand(2);
8226 const DebugLoc &DL = Inst.getDebugLoc();
8227
8228 MachineBasicBlock::iterator MII = Inst;
8229
8230 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8231
8232 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8233
8234 MachineOperand* Op0;
8235 MachineOperand* Op1;
8236
8237 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8238 Op0 = &Src0;
8239 Op1 = &Src1;
8240 } else {
8241 Op0 = &Src1;
8242 Op1 = &Src0;
8243 }
8244
8245 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8246 .add(*Op0);
8247
8248 Register NewDest = MRI.createVirtualRegister(DestRC);
8249
8250 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8251 .addReg(Interm)
8252 .add(*Op1);
8253
8254 MRI.replaceRegWith(Dest.getReg(), NewDest);
8255
8256 Worklist.insert(&Xor);
8257}
8258
8259void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8260 MachineInstr &Inst) const {
8261 MachineBasicBlock &MBB = *Inst.getParent();
8262 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8263
8264 MachineBasicBlock::iterator MII = Inst;
8265 const DebugLoc &DL = Inst.getDebugLoc();
8266
8267 MachineOperand &Dest = Inst.getOperand(0);
8268 MachineOperand &Src = Inst.getOperand(1);
8269
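 // In effect: popcount of a 64-bit value is popcount(lo) + popcount(hi), and
 // since V_BCNT_U32_B32 adds its second operand to the count, the second BCNT
 // below accumulates on top of the first.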
8270 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8271 const TargetRegisterClass *SrcRC = Src.isReg() ?
8272 MRI.getRegClass(Src.getReg()) :
8273 &AMDGPU::SGPR_32RegClass;
8274
8275 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8276 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8277
8278 const TargetRegisterClass *SrcSubRC =
8279 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8280
8281 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8282 AMDGPU::sub0, SrcSubRC);
8283 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8284 AMDGPU::sub1, SrcSubRC);
8285
8286 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8287
8288 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8289
8290 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8291
8292 // We don't need to legalize operands here. src0 for either instruction can be
8293 // an SGPR, and the second input is unused or determined here.
8294 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8295}
8296
8297void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8298 MachineInstr &Inst) const {
8299 MachineBasicBlock &MBB = *Inst.getParent();
8300 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8301 MachineBasicBlock::iterator MII = Inst;
8302 const DebugLoc &DL = Inst.getDebugLoc();
8303
8304 MachineOperand &Dest = Inst.getOperand(0);
8305 uint32_t Imm = Inst.getOperand(2).getImm();
8306 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8307 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8308
8309 (void) Offset;
8310
8311 // Only sext_inreg cases handled.
8312 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8313 Offset == 0 && "Not implemented");
8314
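 // In effect, a 64-bit sext_inreg from BitWidth bits is done as a 32-bit
 // V_BFE_I32 on the low half plus an arithmetic shift right by 31 to form the
 // sign-extended high half; e.g. sext_inreg i64 from i8 of 0x80 yields
 // 0xFFFFFFFFFFFFFF80.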
8315 if (BitWidth < 32) {
8316 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8317 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8318 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8319
8320 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8321 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8322 .addImm(0)
8323 .addImm(BitWidth);
8324
8325 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8326 .addImm(31)
8327 .addReg(MidRegLo);
8328
8329 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8330 .addReg(MidRegLo)
8331 .addImm(AMDGPU::sub0)
8332 .addReg(MidRegHi)
8333 .addImm(AMDGPU::sub1);
8334
8335 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8336 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8337 return;
8338 }
8339
8340 MachineOperand &Src = Inst.getOperand(1);
8341 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8342 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8343
8344 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8345 .addImm(31)
8346 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8347
8348 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8349 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8350 .addImm(AMDGPU::sub0)
8351 .addReg(TmpReg)
8352 .addImm(AMDGPU::sub1);
8353
8354 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8355 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8356}
8357
8358void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8359 MachineInstr &Inst, unsigned Opcode,
8360 MachineDominatorTree *MDT) const {
8361 // (S_FLBIT_I32_B64 hi:lo) ->
8362 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8363 // (S_FF1_I32_B64 hi:lo) ->
8364 // -> (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
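 // For example, for S_FLBIT_I32_B64 on 0x0000000180000000: ffbh(hi = 1) = 31
 // and uaddsat(ffbh(lo = 0x80000000), 32) = 0 + 32 = 32, so the result is
 // umin(31, 32) = 31, the number of leading zeros of the 64-bit value.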
8365
8366 MachineBasicBlock &MBB = *Inst.getParent();
8367 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8368 MachineBasicBlock::iterator MII = Inst;
8369 const DebugLoc &DL = Inst.getDebugLoc();
8370
8371 MachineOperand &Dest = Inst.getOperand(0);
8372 MachineOperand &Src = Inst.getOperand(1);
8373
8374 const MCInstrDesc &InstDesc = get(Opcode);
8375
8376 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8377 unsigned OpcodeAdd =
8378 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8379
8380 const TargetRegisterClass *SrcRC =
8381 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8382 const TargetRegisterClass *SrcSubRC =
8383 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8384
8385 MachineOperand SrcRegSub0 =
8386 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8387 MachineOperand SrcRegSub1 =
8388 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8389
8390 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8391 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8392 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8393 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8394
8395 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8396
8397 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8398
8399 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8400 .addReg(IsCtlz ? MidReg1 : MidReg2)
8401 .addImm(32)
8402 .addImm(1); // enable clamp
8403
8404 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8405 .addReg(MidReg3)
8406 .addReg(IsCtlz ? MidReg2 : MidReg1);
8407
8408 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8409
8410 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8411}
8412
8413void SIInstrInfo::addUsersToMoveToVALUWorklist(
8414 Register DstReg, MachineRegisterInfo &MRI,
8415 SIInstrWorklist &Worklist) const {
8416 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
8417 E = MRI.use_end(); I != E;) {
8418 MachineInstr &UseMI = *I->getParent();
8419
8420 unsigned OpNo = 0;
8421
8422 switch (UseMI.getOpcode()) {
8423 case AMDGPU::COPY:
8424 case AMDGPU::WQM:
8425 case AMDGPU::SOFT_WQM:
8426 case AMDGPU::STRICT_WWM:
8427 case AMDGPU::STRICT_WQM:
8428 case AMDGPU::REG_SEQUENCE:
8429 case AMDGPU::PHI:
8430 case AMDGPU::INSERT_SUBREG:
8431 break;
8432 default:
8433 OpNo = I.getOperandNo();
8434 break;
8435 }
8436
8437 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
8438 Worklist.insert(&UseMI);
8439
8440 do {
8441 ++I;
8442 } while (I != E && I->getParent() == &UseMI);
8443 } else {
8444 ++I;
8445 }
8446 }
8447}
8448
8449void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
8450 MachineRegisterInfo &MRI,
8451 MachineInstr &Inst) const {
8452 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8453 MachineBasicBlock *MBB = Inst.getParent();
8454 MachineOperand &Src0 = Inst.getOperand(1);
8455 MachineOperand &Src1 = Inst.getOperand(2);
8456 const DebugLoc &DL = Inst.getDebugLoc();
8457
8458 switch (Inst.getOpcode()) {
8459 case AMDGPU::S_PACK_LL_B32_B16: {
8460 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8461 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8462
8463 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8464 // 0.
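 // For illustration (hypothetical values): Src0 = 0x12345678 and
 // Src1 = 0xABCDEF01 pack to 0xEF015678, i.e. the low 16 bits of Src0 in the
 // low half and the low 16 bits of Src1 shifted into the high half.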
8465 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8466 .addImm(0xffff);
8467
8468 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
8469 .addReg(ImmReg, RegState::Kill)
8470 .add(Src0);
8471
8472 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8473 .add(Src1)
8474 .addImm(16)
8475 .addReg(TmpReg, RegState::Kill);
8476 break;
8477 }
8478 case AMDGPU::S_PACK_LH_B32_B16: {
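// V_BFI with mask 0xffff selects the low half from Src0 and the high half
// from Src1, i.e. ResultReg = (Src0 & 0xffff) | (Src1 & 0xffff0000).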
8479 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8480 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8481 .addImm(0xffff);
8482 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
8483 .addReg(ImmReg, RegState::Kill)
8484 .add(Src0)
8485 .add(Src1);
8486 break;
8487 }
8488 case AMDGPU::S_PACK_HL_B32_B16: {
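// Computes ResultReg = (Src1 << 16) | (Src0 >> 16).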
8489 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8490 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8491 .addImm(16)
8492 .add(Src0);
8493 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8494 .add(Src1)
8495 .addImm(16)
8496 .addReg(TmpReg, RegState::Kill);
8497 break;
8498 }
8499 case AMDGPU::S_PACK_HH_B32_B16: {
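// Computes ResultReg = (Src1 & 0xffff0000) | (Src0 >> 16).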
8500 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8501 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8502 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8503 .addImm(16)
8504 .add(Src0);
8505 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8506 .addImm(0xffff0000);
8507 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
8508 .add(Src1)
8509 .addReg(ImmReg, RegState::Kill)
8510 .addReg(TmpReg, RegState::Kill);
8511 break;
8512 }
8513 default:
8514 llvm_unreachable("unhandled s_pack_* instruction");
8515 }
8516
8517 MachineOperand &Dest = Inst.getOperand(0);
8518 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8519 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8520}
8521
8522void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
8523 MachineInstr &SCCDefInst,
8524 SIInstrWorklist &Worklist,
8525 Register NewCond) const {
8526
8527 // Ensure that def inst defines SCC, which is still live.
8528 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
8529 !Op.isDead() && Op.getParent() == &SCCDefInst);
8530 SmallVector<MachineInstr *, 4> CopyToDelete;
8531 // This assumes that all the users of SCC are in the same block
8532 // as the SCC def.
8533 for (MachineInstr &MI : // Skip the def inst itself.
8534 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
8535 SCCDefInst.getParent()->end())) {
8536 // Check if SCC is used first.
8537 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
8538 if (SCCIdx != -1) {
8539 if (MI.isCopy()) {
8540 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8541 Register DestReg = MI.getOperand(0).getReg();
8542
8543 MRI.replaceRegWith(DestReg, NewCond);
8544 CopyToDelete.push_back(&MI);
8545 } else {
8546
8547 if (NewCond.isValid())
8548 MI.getOperand(SCCIdx).setReg(NewCond);
8549
8550 Worklist.insert(&MI);
8551 }
8552 }
8553 // Exit if we find another SCC def.
8554 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
8555 break;
8556 }
8557 for (auto &Copy : CopyToDelete)
8558 Copy->eraseFromParent();
8559}
8560
8561// Instructions that use SCC may be converted to VALU instructions. When that
8562// happens, the SCC register is changed to VCC_LO. The instruction that defines
8563// SCC must be changed to an instruction that defines VCC. This function makes
8564// sure that the instruction that defines SCC is added to the moveToVALU
8565// worklist.
8566void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
8567 SIInstrWorklist &Worklist) const {
8568 // Look for a preceding instruction that either defines VCC or SCC. If VCC
8569 // then there is nothing to do because the defining instruction has been
8570 // converted to a VALU already. If SCC then that instruction needs to be
8571 // converted to a VALU.
8572 for (MachineInstr &MI :
8573 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
8574 SCCUseInst->getParent()->rend())) {
8575 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
8576 break;
8577 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
8578 Worklist.insert(&MI);
8579 break;
8580 }
8581 }
8582}
8583
8584const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
8585 const MachineInstr &Inst) const {
8586 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
8587
8588 switch (Inst.getOpcode()) {
8589 // For target instructions, getOpRegClass just returns the virtual register
8590 // class associated with the operand, so we need to find an equivalent VGPR
8591 // register class in order to move the instruction to the VALU.
8592 case AMDGPU::COPY:
8593 case AMDGPU::PHI:
8594 case AMDGPU::REG_SEQUENCE:
8595 case AMDGPU::INSERT_SUBREG:
8596 case AMDGPU::WQM:
8597 case AMDGPU::SOFT_WQM:
8598 case AMDGPU::STRICT_WWM:
8599 case AMDGPU::STRICT_WQM: {
8600 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
8601 if (RI.isAGPRClass(SrcRC)) {
8602 if (RI.isAGPRClass(NewDstRC))
8603 return nullptr;
8604
8605 switch (Inst.getOpcode()) {
8606 case AMDGPU::PHI:
8607 case AMDGPU::REG_SEQUENCE:
8608 case AMDGPU::INSERT_SUBREG:
8609 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
8610 break;
8611 default:
8612 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8613 }
8614
8615 if (!NewDstRC)
8616 return nullptr;
8617 } else {
8618 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
8619 return nullptr;
8620
8621 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8622 if (!NewDstRC)
8623 return nullptr;
8624 }
8625
8626 return NewDstRC;
8627 }
8628 default:
8629 return NewDstRC;
8630 }
8631}
8632
8633// Find the one SGPR operand we are allowed to use.
8634Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
8635 int OpIndices[3]) const {
8636 const MCInstrDesc &Desc = MI.getDesc();
8637
8638 // Find the one SGPR operand we are allowed to use.
8639 //
8640 // First we need to consider the instruction's operand requirements before
8641 // legalizing. Some operands are required to be SGPRs, such as implicit uses
8642 // of VCC, but we are still bound by the constant bus requirement to only use
8643 // one.
8644 //
8645 // If the operand's class is an SGPR, we can never move it.
8646
8647 Register SGPRReg = findImplicitSGPRRead(MI);
8648 if (SGPRReg)
8649 return SGPRReg;
8650
8651 Register UsedSGPRs[3] = {Register()};
8652 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8653
8654 for (unsigned i = 0; i < 3; ++i) {
8655 int Idx = OpIndices[i];
8656 if (Idx == -1)
8657 break;
8658
8659 const MachineOperand &MO = MI.getOperand(Idx);
8660 if (!MO.isReg())
8661 continue;
8662
8663 // Is this operand statically required to be an SGPR based on the operand
8664 // constraints?
8665 const TargetRegisterClass *OpRC =
8666 RI.getRegClass(Desc.operands()[Idx].RegClass);
8667 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
8668 if (IsRequiredSGPR)
8669 return MO.getReg();
8670
8671 // If this could be a VGPR or an SGPR, check the dynamic register class.
8672 Register Reg = MO.getReg();
8673 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
8674 if (RI.isSGPRClass(RegRC))
8675 UsedSGPRs[i] = Reg;
8676 }
8677
8678 // We don't have a required SGPR operand, so we have a bit more freedom in
8679 // selecting operands to move.
8680
8681 // Try to select the most used SGPR. If an SGPR is equal to one of the
8682 // others, we choose that.
8683 //
8684 // e.g.
8685 // V_FMA_F32 v0, s0, s0, s0 -> No moves
8686 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
8687
8688 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
8689 // prefer those.
8690
8691 if (UsedSGPRs[0]) {
8692 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
8693 SGPRReg = UsedSGPRs[0];
8694 }
8695
8696 if (!SGPRReg && UsedSGPRs[1]) {
8697 if (UsedSGPRs[1] == UsedSGPRs[2])
8698 SGPRReg = UsedSGPRs[1];
8699 }
8700
8701 return SGPRReg;
8702}
8703
8705 unsigned OperandName) const {
8706 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
8707 if (Idx == -1)
8708 return nullptr;
8709
8710 return &MI.getOperand(Idx);
8711}
8712
8718 return (Format << 44) |
8719 (1ULL << 56) | // RESOURCE_LEVEL = 1
8720 (3ULL << 60); // OOB_SELECT = 3
8721 }
8722
8723 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
8724 if (ST.isAmdHsaOS()) {
8725 // Set ATC = 1. GFX9 doesn't have this bit.
8727 RsrcDataFormat |= (1ULL << 56);
8728
8729 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
8730 // Note that this disables the TC L2 cache and therefore decreases performance.
8732 RsrcDataFormat |= (2ULL << 59);
8733 }
8734
8735 return RsrcDataFormat;
8736}
8737
8741 0xffffffff; // Size;
8742
8743 // GFX9 doesn't have ELEMENT_SIZE.
8745 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
8746 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
8747 }
8748
8749 // INDEX_STRIDE = 64 for wave64 and 32 for wave32 (field values 3 and 2).
8750 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
8751 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
8752
8753 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
8754 // Clear them unless we want a huge stride.
8757 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
8758
8759 return Rsrc23;
8760}
8761
8763 unsigned Opc = MI.getOpcode();
8764
8765 return isSMRD(Opc);
8766}
8767
8769 return get(Opc).mayLoad() &&
8770 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
8771}
8772
8774 int &FrameIndex) const {
8775 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
8776 if (!Addr || !Addr->isFI())
8777 return Register();
8778
8779 assert(!MI.memoperands_empty() &&
8780 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
8781
8782 FrameIndex = Addr->getIndex();
8783 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
8784}
8785
8787 int &FrameIndex) const {
8788 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
8789 assert(Addr && Addr->isFI());
8790 FrameIndex = Addr->getIndex();
8791 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
8792}
8793
8795 int &FrameIndex) const {
8796 if (!MI.mayLoad())
8797 return Register();
8798
8799 if (isMUBUF(MI) || isVGPRSpill(MI))
8800 return isStackAccess(MI, FrameIndex);
8801
8802 if (isSGPRSpill(MI))
8803 return isSGPRStackAccess(MI, FrameIndex);
8804
8805 return Register();
8806}
8807
8809 int &FrameIndex) const {
8810 if (!MI.mayStore())
8811 return Register();
8812
8813 if (isMUBUF(MI) || isVGPRSpill(MI))
8814 return isStackAccess(MI, FrameIndex);
8815
8816 if (isSGPRSpill(MI))
8817 return isSGPRStackAccess(MI, FrameIndex);
8818
8819 return Register();
8820}
8821
8823 unsigned Size = 0;
8825 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
8826 while (++I != E && I->isInsideBundle()) {
8827 assert(!I->isBundle() && "No nested bundle!");
8829 }
8830
8831 return Size;
8832}
8833
8835 unsigned Opc = MI.getOpcode();
8837 unsigned DescSize = Desc.getSize();
8838
8839 // If we have a definitive size, we can use it. Otherwise we need to inspect
8840 // the operands to know the size.
8841 if (isFixedSize(MI)) {
8842 unsigned Size = DescSize;
8843
8844 // If we hit the buggy offset, an extra nop will be inserted in MC so
8845 // estimate the worst case.
8846 if (MI.isBranch() && ST.hasOffset3fBug())
8847 Size += 4;
8848
8849 return Size;
8850 }
8851
8852 // Instructions may have a 32-bit literal encoded after them. Check
8853 // operands that could ever be literals.
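// For example, a VOP2 encoding is 4 bytes, so a non-inline 32-bit constant
// operand brings the total to 8 bytes.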
8854 if (isVALU(MI) || isSALU(MI)) {
8855 if (isDPP(MI))
8856 return DescSize;
8857 bool HasLiteral = false;
8858 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
8859 const MachineOperand &Op = MI.getOperand(I);
8860 const MCOperandInfo &OpInfo = Desc.operands()[I];
8861 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
8862 HasLiteral = true;
8863 break;
8864 }
8865 }
8866 return HasLiteral ? DescSize + 4 : DescSize;
8867 }
8868
8869 // Check whether we have extra NSA words.
8870 if (isMIMG(MI)) {
8871 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
8872 if (VAddr0Idx < 0)
8873 return 8;
8874
8875 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
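// The base encoding is 8 bytes and holds the first address; the remaining
// vaddr operands are packed four per extra 4-byte NSA dword, hence the
// rounded-up division.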
8876 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
8877 }
8878
8879 switch (Opc) {
8880 case TargetOpcode::BUNDLE:
8881 return getInstBundleSize(MI);
8882 case TargetOpcode::INLINEASM:
8883 case TargetOpcode::INLINEASM_BR: {
8884 const MachineFunction *MF = MI.getParent()->getParent();
8885 const char *AsmStr = MI.getOperand(0).getSymbolName();
8886 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
8887 }
8888 default:
8889 if (MI.isMetaInstruction())
8890 return 0;
8891 return DescSize;
8892 }
8893}
8894
8896 if (!isFLAT(MI))
8897 return false;
8898
8899 if (MI.memoperands_empty())
8900 return true;
8901
8902 for (const MachineMemOperand *MMO : MI.memoperands()) {
8903 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
8904 return true;
8905 }
8906 return false;
8907}
8908
8911 static const std::pair<int, const char *> TargetIndices[] = {
8912 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
8913 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
8914 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
8915 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
8916 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
8917 return ArrayRef(TargetIndices);
8918}
8919
8920 /// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
8921/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
8924 const ScheduleDAG *DAG) const {
8925 return new GCNHazardRecognizer(DAG->MF);
8926}
8927
8928/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
8929/// pass.
8932 return new GCNHazardRecognizer(MF);
8933}
8934
8935// Called during:
8936// - pre-RA scheduling and post-RA scheduling
8939 const ScheduleDAGMI *DAG) const {
8940 // Borrowed from the ARM target.
8941 // We would like to restrict this hazard recognizer to only
8942 // post-RA scheduling; we can tell that we're post-RA because we don't
8943 // track VRegLiveness.
8944 if (!DAG->hasVRegLiveness())
8945 return new GCNHazardRecognizer(DAG->MF);
8947}
8948
8949std::pair<unsigned, unsigned>
8951 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
8952}
8953
8956 static const std::pair<unsigned, const char *> TargetFlags[] = {
8957 { MO_GOTPCREL, "amdgpu-gotprel" },
8958 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
8959 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
8960 { MO_REL32_LO, "amdgpu-rel32-lo" },
8961 { MO_REL32_HI, "amdgpu-rel32-hi" },
8962 { MO_ABS32_LO, "amdgpu-abs32-lo" },
8963 { MO_ABS32_HI, "amdgpu-abs32-hi" },
8964 };
8965
8966 return ArrayRef(TargetFlags);
8967}
8968
8971 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8972 {
8973 {MONoClobber, "amdgpu-noclobber"},
8974 {MOLastUse, "amdgpu-last-use"},
8975 };
8976
8977 return ArrayRef(TargetFlags);
8978}
8979
8981 const MachineFunction &MF) const {
8983 assert(SrcReg.isVirtual());
8984 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
8985 return AMDGPU::WWM_COPY;
8986
8987 return AMDGPU::COPY;
8988}
8989
8991 Register Reg) const {
8992 // We need to handle instructions which may be inserted during register
8993 // allocation to handle the prolog. The initial prolog instruction may have
8994 // been separated from the start of the block by spills and copies inserted
8995 // for the prolog. However, the insertions for scalar registers can
8996 // always be placed at the BB top as they are independent of the exec mask
8997 // value.
8998 const MachineFunction *MF = MI.getParent()->getParent();
8999 bool IsNullOrVectorRegister = true;
9000 if (Reg) {
9001 const MachineRegisterInfo &MRI = MF->getRegInfo();
9002 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9003 }
9004
9005 uint16_t Opcode = MI.getOpcode();
9007 return IsNullOrVectorRegister &&
9008 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9009 (Opcode == AMDGPU::IMPLICIT_DEF &&
9010 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9011 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9012 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9013}
9014
9018 const DebugLoc &DL,
9019 Register DestReg) const {
9020 if (ST.hasAddNoCarry())
9021 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9022
9024 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9025 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9026
9027 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9028 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9029}
9030
9033 const DebugLoc &DL,
9034 Register DestReg,
9035 RegScavenger &RS) const {
9036 if (ST.hasAddNoCarry())
9037 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9038
9039 // If available, prefer to use vcc.
9040 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9041 ? Register(RI.getVCC())
9043 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9044 0, /* AllowSpill */ false);
9045
9046 // TODO: Users need to deal with this.
9047 if (!UnusedCarry.isValid())
9048 return MachineInstrBuilder();
9049
9050 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9051 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9052}
9053
9054bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9055 switch (Opcode) {
9056 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9057 case AMDGPU::SI_KILL_I1_TERMINATOR:
9058 return true;
9059 default:
9060 return false;
9061 }
9062}
9063
9065 switch (Opcode) {
9066 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9067 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9068 case AMDGPU::SI_KILL_I1_PSEUDO:
9069 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9070 default:
9071 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9072 }
9073}
9074
9075bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9076 return Imm <= getMaxMUBUFImmOffset(ST);
9077}
9078
9080 // The GFX12 field is a 24-bit signed byte offset, so the largest legal non-negative value uses 23 bits.
9081 const unsigned OffsetBits =
9082 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9083 return (1 << OffsetBits) - 1;
9084}
9085
9087 if (!ST.isWave32())
9088 return;
9089
9090 if (MI.isInlineAsm())
9091 return;
9092
9093 for (auto &Op : MI.implicit_operands()) {
9094 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9095 Op.setReg(AMDGPU::VCC_LO);
9096 }
9097}
9098
9100 if (!isSMRD(MI))
9101 return false;
9102
9103 // Check that it is using a buffer resource.
9104 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9105 if (Idx == -1) // e.g. s_memtime
9106 return false;
9107
9108 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9109 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9110}
9111
9112// Given Imm, split it into the values to put into the SOffset and ImmOffset
9113// fields in an MUBUF instruction. Return false if it is not possible (due to a
9114// hardware bug needing a workaround).
9115//
9116// The required alignment ensures that individual address components remain
9117// aligned if they are aligned to begin with. It also ensures that additional
9118// offsets within the given alignment can be added to the resulting ImmOffset.
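// For example, with a 12-bit immediate field (MaxOffset = 4095) and Align(4):
//   Imm = 4100  -> ImmOffset = 4092, SOffset = 8 (fits an inline constant)
//   Imm = 10000 -> ImmOffset = 1812, SOffset = 8188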
9120 uint32_t &ImmOffset, Align Alignment) const {
9121 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9122 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9123 uint32_t Overflow = 0;
9124
9125 if (Imm > MaxImm) {
9126 if (Imm <= MaxImm + 64) {
9127 // Use an SOffset inline constant for 4..64
9128 Overflow = Imm - MaxImm;
9129 Imm = MaxImm;
9130 } else {
9131 // Try to keep the same value in SOffset for adjacent loads, so that
9132 // the corresponding register contents can be re-used.
9133 //
9134 // Load values with all low-bits (except for alignment bits) set into
9135 // SOffset, so that a larger range of values can be covered using
9136 // s_movk_i32.
9137 //
9138 // Atomic operations fail to work correctly when individual address
9139 // components are unaligned, even if their sum is aligned.
9140 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9141 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9142 Imm = Low;
9143 Overflow = High - Alignment.value();
9144 }
9145 }
9146
9147 if (Overflow > 0) {
9148 // There is a hardware bug in SI and CI which prevents address clamping in
9149 // MUBUF instructions from working correctly with SOffsets. The immediate
9150 // offset is unaffected.
9152 return false;
9153
9154 // It is not possible to set an immediate in the SOffset field on some targets.
9155 if (ST.hasRestrictedSOffset())
9156 return false;
9157 }
9158
9159 ImmOffset = Imm;
9160 SOffset = Overflow;
9161 return true;
9162}
9163
9164 // Depending on the address space and instruction used, some immediate offsets
9165 // are allowed and some are not.
9166 // Pre-GFX12, flat instruction offsets can only be non-negative, while global
9167 // and scratch instruction offsets can also be negative. On GFX12, offsets can
9168 // be negative for all variants.
9169//
9170// There are several bugs related to these offsets:
9171// On gfx10.1, flat instructions that go into the global address space cannot
9172// use an offset.
9173//
9174// For scratch instructions, the address can be either an SGPR or a VGPR.
9175// The following offsets can be used, depending on the architecture (x means
9176// cannot be used):
9177// +----------------------------+------+------+
9178// | Address-Mode | SGPR | VGPR |
9179// +----------------------------+------+------+
9180// | gfx9 | | |
9181// | negative, 4-aligned offset | x | ok |
9182// | negative, unaligned offset | x | ok |
9183// +----------------------------+------+------+
9184// | gfx10 | | |
9185// | negative, 4-aligned offset | ok | ok |
9186// | negative, unaligned offset | ok | x |
9187// +----------------------------+------+------+
9188// | gfx10.3 | | |
9189// | negative, 4-aligned offset | ok | ok |
9190// | negative, unaligned offset | ok | ok |
9191// +----------------------------+------+------+
9192//
9193// This function ignores the addressing mode, so if an offset cannot be used in
9194// one addressing mode, it is considered illegal.
9195bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9196 uint64_t FlatVariant) const {
9197 // TODO: Should 0 be special cased?
9198 if (!ST.hasFlatInstOffsets())
9199 return false;
9200
9201 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9202 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9203 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9204 return false;
9205
9207 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9208 (Offset % 4) != 0) {
9209 return false;
9210 }
9211
9212 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9213 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9214 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9215}
9216
9217 // See the comment on SIInstrInfo::isLegalFLATOffset for what is legal and what is not.
9218std::pair<int64_t, int64_t>
9219SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9220 uint64_t FlatVariant) const {
9221 int64_t RemainderOffset = COffsetVal;
9222 int64_t ImmField = 0;
9223
9224 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9225 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9226
9227 if (AllowNegative) {
9228 // Use signed division by a power of two to truncate towards 0.
9229 int64_t D = 1LL << NumBits;
9230 RemainderOffset = (COffsetVal / D) * D;
9231 ImmField = COffsetVal - RemainderOffset;
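// E.g. with NumBits == 12, COffsetVal == -8195 gives RemainderOffset == -8192
// and ImmField == -3 (before the alignment fix-up below).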
9232
9234 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9235 (ImmField % 4) != 0) {
9236 // Make ImmField a multiple of 4
9237 RemainderOffset += ImmField % 4;
9238 ImmField -= ImmField % 4;
9239 }
9240 } else if (COffsetVal >= 0) {
9241 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9242 RemainderOffset = COffsetVal - ImmField;
9243 }
9244
9245 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9246 assert(RemainderOffset + ImmField == COffsetVal);
9247 return {ImmField, RemainderOffset};
9248}
9249
9251 if (ST.hasNegativeScratchOffsetBug() &&
9252 FlatVariant == SIInstrFlags::FlatScratch)
9253 return false;
9254
9255 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9256}
9257
9258static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9259 switch (ST.getGeneration()) {
9260 default:
9261 break;
9264 return SIEncodingFamily::SI;
9267 return SIEncodingFamily::VI;
9274 }
9275 llvm_unreachable("Unknown subtarget generation!");
9276}
9277
9278bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9279 switch(MCOp) {
9280 // These opcodes use indirect register addressing so
9281 // they need special handling by codegen (currently missing).
9282 // Therefore it is too risky to allow these opcodes
9283 // to be selected by the DPP combiner or the SDWA peephole pass.
9284 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9285 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9286 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9287 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9288 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9289 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9290 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9291 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9292 return true;
9293 default:
9294 return false;
9295 }
9296}
9297
9298#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
9299 case OPCODE##_dpp: \
9300 case OPCODE##_e32: \
9301 case OPCODE##_e64: \
9302 case OPCODE##_e64_dpp: \
9303 case OPCODE##_sdwa:
9304
9305static bool isRenamedInGFX9(int Opcode) {
9306 switch (Opcode) {
9307 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
9308 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
9309 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
9310 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
9311 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
9312 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
9313 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
9314 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
9315 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
9316 //
9317 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
9318 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
9319 case AMDGPU::V_FMA_F16_gfx9_e64:
9320 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
9321 case AMDGPU::V_INTERP_P2_F16:
9322 case AMDGPU::V_MAD_F16_e64:
9323 case AMDGPU::V_MAD_U16_e64:
9324 case AMDGPU::V_MAD_I16_e64:
9325 return true;
9326 default:
9327 return false;
9328 }
9329}
9330
9331int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9332 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9333
9334 unsigned Gen = subtargetEncodingFamily(ST);
9335
9338
9339 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9340 // subtarget has UnpackedD16VMem feature.
9341 // TODO: remove this when we discard GFX80 encoding.
9342 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9344
9345 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9346 switch (ST.getGeneration()) {
9347 default:
9349 break;
9352 break;
9355 break;
9356 }
9357 }
9358
9359 if (isMAI(Opcode)) {
9360 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9361 if (MFMAOp != -1)
9362 Opcode = MFMAOp;
9363 }
9364
9365 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9366
9367 // -1 means that Opcode is already a native instruction.
9368 if (MCOp == -1)
9369 return Opcode;
9370
9371 if (ST.hasGFX90AInsts()) {
9372 uint16_t NMCOp = (uint16_t)-1;
9373 if (ST.hasGFX940Insts())
9375 if (NMCOp == (uint16_t)-1)
9377 if (NMCOp == (uint16_t)-1)
9379 if (NMCOp != (uint16_t)-1)
9380 MCOp = NMCOp;
9381 }
9382
9383 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9384 // no encoding in the given subtarget generation.
9385 if (MCOp == (uint16_t)-1)
9386 return -1;
9387
9388 if (isAsmOnlyOpcode(MCOp))
9389 return -1;
9390
9391 return MCOp;
9392}
9393
9394static
9396 assert(RegOpnd.isReg());
9397 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9398 getRegSubRegPair(RegOpnd);
9399}
9400
9403 assert(MI.isRegSequence());
9404 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9405 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9406 auto &RegOp = MI.getOperand(1 + 2 * I);
9407 return getRegOrUndef(RegOp);
9408 }
9410}
9411
9412// Try to find the definition of reg:subreg in subreg-manipulation pseudos
9413// Following a subreg of reg:subreg isn't supported
9416 if (!RSR.SubReg)
9417 return false;
9418 switch (MI.getOpcode()) {
9419 default: break;
9420 case AMDGPU::REG_SEQUENCE:
9421 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
9422 return true;
9423 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
9424 case AMDGPU::INSERT_SUBREG:
9425 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
9426 // inserted the subreg we're looking for
9427 RSR = getRegOrUndef(MI.getOperand(2));
9428 else { // the subreg in the rest of the reg
9429 auto R1 = getRegOrUndef(MI.getOperand(1));
9430 if (R1.SubReg) // subreg of subreg isn't supported
9431 return false;
9432 RSR.Reg = R1.Reg;
9433 }
9434 return true;
9435 }
9436 return false;
9437}
9438
9441 assert(MRI.isSSA());
9442 if (!P.Reg.isVirtual())
9443 return nullptr;
9444
9445 auto RSR = P;
9446 auto *DefInst = MRI.getVRegDef(RSR.Reg);
9447 while (auto *MI = DefInst) {
9448 DefInst = nullptr;
9449 switch (MI->getOpcode()) {
9450 case AMDGPU::COPY:
9451 case AMDGPU::V_MOV_B32_e32: {
9452 auto &Op1 = MI->getOperand(1);
9453 if (Op1.isReg() && Op1.getReg().isVirtual()) {
9454 if (Op1.isUndef())
9455 return nullptr;
9456 RSR = getRegSubRegPair(Op1);
9457 DefInst = MRI.getVRegDef(RSR.Reg);
9458 }
9459 break;
9460 }
9461 default:
9462 if (followSubRegDef(*MI, RSR)) {
9463 if (!RSR.Reg)
9464 return nullptr;
9465 DefInst = MRI.getVRegDef(RSR.Reg);
9466 }
9467 }
9468 if (!DefInst)
9469 return MI;
9470 }
9471 return nullptr;
9472}
9473
9475 Register VReg,
9476 const MachineInstr &DefMI,
9477 const MachineInstr &UseMI) {
9478 assert(MRI.isSSA() && "Must be run on SSA");
9479
9480 auto *TRI = MRI.getTargetRegisterInfo();
9481 auto *DefBB = DefMI.getParent();
9482
9483 // Don't bother searching between blocks, although it is possible this block
9484 // doesn't modify exec.
9485 if (UseMI.getParent() != DefBB)
9486 return true;
9487
9488 const int MaxInstScan = 20;
9489 int NumInst = 0;
9490
9491 // Stop scan at the use.
9492 auto E = UseMI.getIterator();
9493 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
9494 if (I->isDebugInstr())
9495 continue;
9496
9497 if (++NumInst > MaxInstScan)
9498 return true;
9499
9500 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9501 return true;
9502 }
9503
9504 return false;
9505}
9506
9508 Register VReg,
9509 const MachineInstr &DefMI) {
9510 assert(MRI.isSSA() && "Must be run on SSA");
9511
9512 auto *TRI = MRI.getTargetRegisterInfo();
9513 auto *DefBB = DefMI.getParent();
9514
9515 const int MaxUseScan = 10;
9516 int NumUse = 0;
9517
9518 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
9519 auto &UseInst = *Use.getParent();
9520 // Don't bother searching between blocks, although it is possible this block
9521 // doesn't modify exec.
9522 if (UseInst.getParent() != DefBB || UseInst.isPHI())
9523 return true;
9524
9525 if (++NumUse > MaxUseScan)
9526 return true;
9527 }
9528
9529 if (NumUse == 0)
9530 return false;
9531
9532 const int MaxInstScan = 20;
9533 int NumInst = 0;
9534
9535 // Stop scan when we have seen all the uses.
9536 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
9537 assert(I != DefBB->end());
9538
9539 if (I->isDebugInstr())
9540 continue;
9541
9542 if (++NumInst > MaxInstScan)
9543 return true;
9544
9545 for (const MachineOperand &Op : I->operands()) {
9546 // We don't check reg masks here as they're used only on calls:
9547 // 1. EXEC is only considered const within one BB
9548 // 2. Call should be a terminator instruction if present in a BB
9549
9550 if (!Op.isReg())
9551 continue;
9552
9553 Register Reg = Op.getReg();
9554 if (Op.isUse()) {
9555 if (Reg == VReg && --NumUse == 0)
9556 return false;
9557 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
9558 return true;
9559 }
9560 }
9561}
9562
9565 const DebugLoc &DL, Register Src, Register Dst) const {
9566 auto Cur = MBB.begin();
9567 if (Cur != MBB.end())
9568 do {
9569 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
9570 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
9571 ++Cur;
9572 } while (Cur != MBB.end() && Cur != LastPHIIt);
9573
9574 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
9575 Dst);
9576}
9577
9580 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
9581 if (InsPt != MBB.end() &&
9582 (InsPt->getOpcode() == AMDGPU::SI_IF ||
9583 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9584 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9585 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
9586 InsPt++;
9587 return BuildMI(MBB, InsPt, DL,
9588 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
9589 : AMDGPU::S_MOV_B64_term),
9590 Dst)
9591 .addReg(Src, 0, SrcSubReg)
9592 .addReg(AMDGPU::EXEC, RegState::Implicit);
9593 }
9594 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
9595 Dst);
9596}
9597
9598bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
9599
9602 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
9603 VirtRegMap *VRM) const {
9604 // This is a bit of a hack (copied from AArch64). Consider this instruction:
9605 //
9606 // %0:sreg_32 = COPY $m0
9607 //
9608 // We explicitly chose SReg_32 for the virtual register so such a copy might
9609 // be eliminated by RegisterCoalescer. However, that may not be possible, and
9610 // %0 may even spill. We can't spill $m0 normally (it would require copying to
9611 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
9612 // TargetInstrInfo::foldMemoryOperand() is going to try.
9613 // A similar issue also exists with spilling and reloading $exec registers.
9614 //
9615 // To prevent that, constrain the %0 register class here.
9616 if (isFullCopyInstr(MI)) {
9617 Register DstReg = MI.getOperand(0).getReg();
9618 Register SrcReg = MI.getOperand(1).getReg();
9619 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
9620 (DstReg.isVirtual() != SrcReg.isVirtual())) {
9622 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
9623 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
9624 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
9625 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
9626 return nullptr;
9627 }
9628 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
9629 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
9630 return nullptr;
9631 }
9632 }
9633 }
9634
9635 return nullptr;
9636}
9637
9639 const MachineInstr &MI,
9640 unsigned *PredCost) const {
9641 if (MI.isBundle()) {
9643 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
9644 unsigned Lat = 0, Count = 0;
9645 for (++I; I != E && I->isBundledWithPred(); ++I) {
9646 ++Count;
9647 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
9648 }
9649 return Lat + Count - 1;
9650 }
9651
9652 return SchedModel.computeInstrLatency(&MI);
9653}
9654
9657 unsigned opcode = MI.getOpcode();
9658 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
9659 auto IID = GI->getIntrinsicID();
9664
9665 switch (IID) {
9666 case Intrinsic::amdgcn_if:
9667 case Intrinsic::amdgcn_else:
9668 // FIXME: Uniform if second result
9669 break;
9670 }
9671
9673 }
9674
9675 // Loads from the private and flat address spaces are divergent, because
9676 // threads can execute the load instruction with the same inputs and get
9677 // different results.
9678 //
9679 // All other loads are not divergent, because if threads issue loads with the
9680 // same arguments, they will always get the same result.
9681 if (opcode == AMDGPU::G_LOAD) {
9682 if (MI.memoperands_empty())
9683 return InstructionUniformity::NeverUniform; // conservative assumption
9684
9685 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9686 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9687 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9688 })) {
9689 // At least one MMO in a non-global address space.
9691 }
9693 }
9694
9695 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
9696 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
9697 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
9698 AMDGPU::isGenericAtomic(opcode)) {
9700 }
9702}
9703
9706
9707 if (isNeverUniform(MI))
9709
9710 unsigned opcode = MI.getOpcode();
9711 if (opcode == AMDGPU::V_READLANE_B32 ||
9712 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
9713 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
9715
9716 if (isCopyInstr(MI)) {
9717 const MachineOperand &srcOp = MI.getOperand(1);
9718 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
9719 const TargetRegisterClass *regClass =
9720 RI.getPhysRegBaseClass(srcOp.getReg());
9723 }
9725 }
9726
9727 // GMIR handling
9728 if (MI.isPreISelOpcode())
9730
9731 // Atomics are divergent because they are executed sequentially: when an
9732 // atomic operation refers to the same address in each thread, then each
9733 // thread after the first sees the value written by the previous thread as
9734 // the original value.
9735
9736 if (isAtomic(MI))
9738
9739 // Loads from the private and flat address spaces are divergent, because
9740 // threads can execute the load instruction with the same inputs and get
9741 // different results.
9742 if (isFLAT(MI) && MI.mayLoad()) {
9743 if (MI.memoperands_empty())
9744 return InstructionUniformity::NeverUniform; // conservative assumption
9745
9746 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9747 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9748 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9749 })) {
9750 // At least one MMO in a non-global address space.
9752 }
9753
9755 }
9756
9757 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9758 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
9759
9760 // FIXME: It's conceptually broken to report this for an instruction, and not
9761 // a specific def operand. For inline asm in particular, there could be mixed
9762 // uniform and divergent results.
9763 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
9764 const MachineOperand &SrcOp = MI.getOperand(I);
9765 if (!SrcOp.isReg())
9766 continue;
9767
9768 Register Reg = SrcOp.getReg();
9769 if (!Reg || !SrcOp.readsReg())
9770 continue;
9771
9772 // If RegBank is null, this is unassigned or an unallocatable special
9773 // register, which are all scalars.
9774 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
9775 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
9777 }
9778
9779 // TODO: The uniformity check conditions above can be rearranged for better
9780 // readability.
9781
9782 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
9783 // currently turned into no-op COPYs by SelectionDAG ISel and are
9784 // therefore no longer recognizable.
9785
9787}
9788
9790 switch (MF.getFunction().getCallingConv()) {
9792 return 1;
9794 return 2;
9796 return 3;
9800 report_fatal_error("ds_ordered_count unsupported for this calling conv");
9803 case CallingConv::C:
9804 case CallingConv::Fast:
9805 default:
9806 // Assume other calling conventions are various compute callable functions
9807 return 0;
9808 }
9809}
9810
9812 Register &SrcReg2, int64_t &CmpMask,
9813 int64_t &CmpValue) const {
9814 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
9815 return false;
9816
9817 switch (MI.getOpcode()) {
9818 default:
9819 break;
9820 case AMDGPU::S_CMP_EQ_U32:
9821 case AMDGPU::S_CMP_EQ_I32:
9822 case AMDGPU::S_CMP_LG_U32:
9823 case AMDGPU::S_CMP_LG_I32:
9824 case AMDGPU::S_CMP_LT_U32:
9825 case AMDGPU::S_CMP_LT_I32:
9826 case AMDGPU::S_CMP_GT_U32:
9827 case AMDGPU::S_CMP_GT_I32:
9828 case AMDGPU::S_CMP_LE_U32:
9829 case AMDGPU::S_CMP_LE_I32:
9830 case AMDGPU::S_CMP_GE_U32:
9831 case AMDGPU::S_CMP_GE_I32:
9832 case AMDGPU::S_CMP_EQ_U64:
9833 case AMDGPU::S_CMP_LG_U64:
9834 SrcReg = MI.getOperand(0).getReg();
9835 if (MI.getOperand(1).isReg()) {
9836 if (MI.getOperand(1).getSubReg())
9837 return false;
9838 SrcReg2 = MI.getOperand(1).getReg();
9839 CmpValue = 0;
9840 } else if (MI.getOperand(1).isImm()) {
9841 SrcReg2 = Register();
9842 CmpValue = MI.getOperand(1).getImm();
9843 } else {
9844 return false;
9845 }
9846 CmpMask = ~0;
9847 return true;
9848 case AMDGPU::S_CMPK_EQ_U32:
9849 case AMDGPU::S_CMPK_EQ_I32:
9850 case AMDGPU::S_CMPK_LG_U32:
9851 case AMDGPU::S_CMPK_LG_I32:
9852 case AMDGPU::S_CMPK_LT_U32:
9853 case AMDGPU::S_CMPK_LT_I32:
9854 case AMDGPU::S_CMPK_GT_U32:
9855 case AMDGPU::S_CMPK_GT_I32:
9856 case AMDGPU::S_CMPK_LE_U32:
9857 case AMDGPU::S_CMPK_LE_I32:
9858 case AMDGPU::S_CMPK_GE_U32:
9859 case AMDGPU::S_CMPK_GE_I32:
9860 SrcReg = MI.getOperand(0).getReg();
9861 SrcReg2 = Register();
9862 CmpValue = MI.getOperand(1).getImm();
9863 CmpMask = ~0;
9864 return true;
9865 }
9866
9867 return false;
9868}
9869
9871 Register SrcReg2, int64_t CmpMask,
9872 int64_t CmpValue,
9873 const MachineRegisterInfo *MRI) const {
9874 if (!SrcReg || SrcReg.isPhysical())
9875 return false;
9876
9877 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
9878 return false;
9879
9880 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
9881 this](int64_t ExpectedValue, unsigned SrcSize,
9882 bool IsReversible, bool IsSigned) -> bool {
9883 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9884 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9885 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9886 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9887 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
9888 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9889 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9890 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9891 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9892 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
9893 //
9894 // Signed ge/gt are not used for the sign bit.
9895 //
9896 // If result of the AND is unused except in the compare:
9897 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
9898 //
9899 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9900 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9901 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
9902 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9903 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9904 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
9905
9906 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
9907 if (!Def || Def->getParent() != CmpInstr.getParent())
9908 return false;
9909
9910 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
9911 Def->getOpcode() != AMDGPU::S_AND_B64)
9912 return false;
9913
9914 int64_t Mask;
9915 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
9916 if (MO->isImm())
9917 Mask = MO->getImm();
9918 else if (!getFoldableImm(MO, Mask))
9919 return false;
9920 Mask &= maxUIntN(SrcSize);
9921 return isPowerOf2_64(Mask);
9922 };
9923
9924 MachineOperand *SrcOp = &Def->getOperand(1);
9925 if (isMask(SrcOp))
9926 SrcOp = &Def->getOperand(2);
9927 else if (isMask(&Def->getOperand(2)))
9928 SrcOp = &Def->getOperand(1);
9929 else
9930 return false;
9931
9932 // A valid Mask is required to have a single bit set, hence a non-zero and
9933 // power-of-two value. This ensures that we will not do a 64-bit shift below.
9934 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
9935 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
9936 if (IsSigned && BitNo == SrcSize - 1)
9937 return false;
9938
9939 ExpectedValue <<= BitNo;
9940
9941 bool IsReversedCC = false;
9942 if (CmpValue != ExpectedValue) {
9943 if (!IsReversible)
9944 return false;
9945 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
9946 if (!IsReversedCC)
9947 return false;
9948 }
9949
9950 Register DefReg = Def->getOperand(0).getReg();
9951 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
9952 return false;
9953
9954 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
9955 I != E; ++I) {
9956 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
9957 I->killsRegister(AMDGPU::SCC, &RI))
9958 return false;
9959 }
9960
9961 MachineOperand *SccDef =
9962 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
9963 SccDef->setIsDead(false);
9964 CmpInstr.eraseFromParent();
9965
9966 if (!MRI->use_nodbg_empty(DefReg)) {
9967 assert(!IsReversedCC);
9968 return true;
9969 }
9970
9971 // Replace the AND, whose result is unused, with an S_BITCMP.
9972 MachineBasicBlock *MBB = Def->getParent();
9973
9974 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
9975 : AMDGPU::S_BITCMP1_B32
9976 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
9977 : AMDGPU::S_BITCMP1_B64;
9978
9979 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
9980 .add(*SrcOp)
9981 .addImm(BitNo);
9982 Def->eraseFromParent();
9983
9984 return true;
9985 };
9986
9987 switch (CmpInstr.getOpcode()) {
9988 default:
9989 break;
9990 case AMDGPU::S_CMP_EQ_U32:
9991 case AMDGPU::S_CMP_EQ_I32:
9992 case AMDGPU::S_CMPK_EQ_U32:
9993 case AMDGPU::S_CMPK_EQ_I32:
9994 return optimizeCmpAnd(1, 32, true, false);
9995 case AMDGPU::S_CMP_GE_U32:
9996 case AMDGPU::S_CMPK_GE_U32:
9997 return optimizeCmpAnd(1, 32, false, false);
9998 case AMDGPU::S_CMP_GE_I32:
9999 case AMDGPU::S_CMPK_GE_I32:
10000 return optimizeCmpAnd(1, 32, false, true);
10001 case AMDGPU::S_CMP_EQ_U64:
10002 return optimizeCmpAnd(1, 64, true, false);
10003 case AMDGPU::S_CMP_LG_U32:
10004 case AMDGPU::S_CMP_LG_I32:
10005 case AMDGPU::S_CMPK_LG_U32:
10006 case AMDGPU::S_CMPK_LG_I32:
10007 return optimizeCmpAnd(0, 32, true, false);
10008 case AMDGPU::S_CMP_GT_U32:
10009 case AMDGPU::S_CMPK_GT_U32:
10010 return optimizeCmpAnd(0, 32, false, false);
10011 case AMDGPU::S_CMP_GT_I32:
10012 case AMDGPU::S_CMPK_GT_I32:
10013 return optimizeCmpAnd(0, 32, false, true);
10014 case AMDGPU::S_CMP_LG_U64:
10015 return optimizeCmpAnd(0, 64, true, false);
10016 }
10017
10018 return false;
10019}
10020
10022 unsigned OpName) const {
10023 if (!ST.needsAlignedVGPRs())
10024 return;
10025
10026 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10027 if (OpNo < 0)
10028 return;
10029 MachineOperand &Op = MI.getOperand(OpNo);
10030 if (getOpSize(MI, OpNo) > 4)
10031 return;
10032
10033 // Add implicit aligned super-reg to force alignment on the data operand.
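// The 32-bit data register becomes sub0 of an even-aligned 64-bit tuple
// (paired with an undef high half), and the instruction gets an implicit use
// of the tuple so the register allocator must honor the alignment.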
10034 const DebugLoc &DL = MI.getDebugLoc();
10035 MachineBasicBlock *BB = MI.getParent();
10037 Register DataReg = Op.getReg();
10038 bool IsAGPR = RI.isAGPR(MRI, DataReg);
10039 Register Undef = MRI.createVirtualRegister(
10040 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10041 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10042 Register NewVR =
10043 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10044 : &AMDGPU::VReg_64_Align2RegClass);
10045 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10046 .addReg(DataReg, 0, Op.getSubReg())
10047 .addImm(AMDGPU::sub0)
10048 .addReg(Undef)
10049 .addImm(AMDGPU::sub1);
10050 Op.setReg(NewVR);
10051 Op.setSubReg(AMDGPU::sub0);
10052 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
10053}
10054
10056 if (isIGLP(*MI))
10057 return false;
10058
10060}
Definition: MCExpr.h:622
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:222
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:237
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition: MCInstrDesc.h:444
bool mayLoad() const
Return true if this instruction could possibly read memory.
Definition: MCInstrDesc.h:438
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition: MCInstrDesc.h:248
unsigned short Opcode
Definition: MCInstrDesc.h:205
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
Definition: MCInstrDesc.h:565
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
uint8_t OperandType
Information about the type of the operand.
Definition: MCInstrDesc.h:97
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:91
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:398
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
void setVariableValue(const MCExpr *Value)
Definition: MCSymbol.cpp:47
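These MC expression and symbol helpers are typically combined to express a label difference that the assembler resolves later. A minimal sketch, assuming Ctx is an MCContext (for example obtained from MachineFunction::getContext()) and DestLabel, CurLabel, and OffsetSym are MCSymbols already created by the caller:

const MCExpr *Diff = MCBinaryExpr::createSub(
    MCSymbolRefExpr::create(DestLabel, Ctx),
    MCSymbolRefExpr::create(CurLabel, Ctx), Ctx);
// Bind the symbolic distance to OffsetSym so later references see the
// resolved byte offset once layout is known.
OffsetSym->setVariableValue(Diff);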
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
unsigned pred_size() const
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
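A common use of computeRegisterLiveness is to test whether a status register such as SCC may be clobbered at an insertion point. A sketch, assuming TRI points at the target register info and I is the insertion iterator:

bool SCCIsDead =
    MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, I) ==
    MachineBasicBlock::LQR_Dead;
// Only emit an SCC-defining instruction at I when SCCIsDead is true.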
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
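splitAt is how a block is divided when inserted control flow must end the block early. A sketch, assuming SplitInst is the last instruction that should remain in MBB and LIS may be null:

MachineBasicBlock *TailMBB =
    MBB.splitAt(SplitInst, /*UpdateLiveIns=*/true, LIS);
// Instructions after SplitInst now live in TailMBB, which takes over
// MBB's former successors.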
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< iterator > terminators()
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
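A typical allocation pairs getMachineMemOperand with MachinePointerInfo::getFixedStack (listed further below) when describing a stack access. A sketch, assuming MF, its frame info MFI, and FrameIndex are supplied by the caller and the access is a 32-bit load:

MachinePointerInfo PtrInfo =
    MachinePointerInfo::getFixedStack(MF, FrameIndex);
MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo, MachineMemOperand::MOLoad, LLT::scalar(32),
    MFI.getObjectAlign(FrameIndex));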
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:575
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:347
void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
unsigned getNumOperands() const
Returns the total number of operands.
Definition: MachineInstr.h:578
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
iterator_range< mop_iterator > explicit_operands()
Definition: MachineInstr.h:697
unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:821
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:806
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:788
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:499
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
iterator_range< mop_iterator > implicit_operands()
Definition: MachineInstr.h:705
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
uint32_t getFlags() const
Return the MI flags bitvector.
Definition: MachineInstr.h:392
int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
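The ChangeTo* mutators are what operand-folding code uses to rewrite a use in place. A minimal sketch, assuming UseMO is a register use and DefMI is a known materializing move whose operand 1 holds the source:

if (DefMI.getOperand(1).isImm()) {
  // Fold the materialized constant directly into the use.
  UseMO.ChangeToImmediate(DefMI.getOperand(1).getImm());
} else {
  // Otherwise forward the source register instead.
  UseMO.ChangeToRegister(DefMI.getOperand(1).getReg(), /*isDef=*/false);
}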
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
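The scavenger entries above are typically used together when a temporary SGPR is needed after register allocation. A sketch, assuming RS is the RegScavenger handed to the hook and I is the iterator of the instruction that needs the temporary:

RS->enterBasicBlockEnd(MBB);
Register Tmp = RS->scavengeRegisterBackwards(
    AMDGPU::SReg_64RegClass, I, /*RestoreAfter=*/false,
    /*SPAdj=*/0, /*AllowSpill=*/false);
if (!Tmp.isValid())
  report_fatal_error("no free SGPR pair available");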
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:800
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:562
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
static bool isVOP3(const MachineInstr &MI)
Definition: SIInstrInfo.h:512
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
Definition: SIInstrInfo.h:932
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
Definition: SIInstrInfo.h:1173
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
Definition: SIInstrInfo.h:973
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:644
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
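Because getAddNoCarry only creates the opcode and destination, the caller appends the remaining operands. A sketch, assuming BaseReg and Offset come from the surrounding code; the trailing immediate is the clamp modifier expected by the 64-bit encodings:

TII->getAddNoCarry(MBB, I, DL, DestReg)
    .addImm(Offset)
    .addReg(BaseReg)
    .addImm(0); // clamp bit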
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:552
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const
Get required immediate operand.
Definition: SIInstrInfo.h:1303
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:544
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:657
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:416
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:504
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:520
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:612
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIdx operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:528
void removeModOperands(MachineInstr &MI) const
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:596
static bool isSOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:456
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:578
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
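getNamedOperand is the usual way to reach optional operands without hard-coding indices. A sketch, assuming MI is a MUBUF instruction being inspected:

if (const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset)) {
  // The operand is present on this encoding; inspect or rewrite it.
  Register SOffReg = SOffset->getReg();
  (void)SOffReg;
}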
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:636
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:604
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
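hasVALU32BitEncoding is usually paired with canShrink and buildShrunkInst (both listed in this index) to move a VOP3-encoded instruction to its 32-bit form. A sketch, assuming MRI is the function's MachineRegisterInfo:

if (TII->hasVALU32BitEncoding(MI.getOpcode()) &&
    TII->canShrink(MI, MRI)) {
  int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
  // Emits the VOP1/VOP2/VOPC replacement next to MI; the caller
  // typically erases the original afterwards.
  MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
  (void)Inst32;
}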
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
Definition: SIInstrInfo.h:432
static bool isSOPK(const MachineInstr &MI)
Definition: SIInstrInfo.h:472
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:980
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:536
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:626
static bool isF16PseudoScalarTrans(unsigned Opcode)
Definition: SIInstrInfo.h:957
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:768
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
void materializeImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, int64_t Value) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool isVGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:724
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns true if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
static bool isWWMRegSpillOpcode(uint16_t Opcode)
Definition: SIInstrInfo.h:756
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool isVGPRCopy(const MachineInstr &MI) const
Definition: SIInstrInfo.h:1026
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:588
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description.
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to SGPR.
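readlaneVGPRToSGPR is the standard fix-up when a scalar-only operand has ended up with a VGPR value. A sketch, assuming UseMI is the instruction whose operand OpIdx must be scalar:

MachineOperand &MO = UseMI.getOperand(OpIdx);
Register SGPR = TII->readlaneVGPRToSGPR(MO.getReg(), UseMI, MRI);
// Rewrite the operand to use the scalar copy.
MO.setReg(SGPR);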
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
static bool isAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:689
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
static bool sopkIsZext(unsigned Opcode)
Definition: SIInstrInfo.h:871
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
static bool isSGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:736
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:817
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:464
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:620
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:424
bool isBarrier(unsigned Opcode) const
Definition: SIInstrInfo.h:947
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
Definition: SIInstrInfo.h:1316
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
static bool isFixedSize(const MachineInstr &MI)
Definition: SIInstrInfo.h:888
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the OpIdx operand for MI.
static bool isLDSDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:570
unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isVOP1(const MachineInstr &MI)
Definition: SIInstrInfo.h:496
SIInstrInfo(const GCNSubtarget &ST)
Definition: SIInstrInfo.cpp:63
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
static bool hasVectorRegisters(const TargetRegisterClass *RC)
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool opCanUseInlineConstant(unsigned OpType) const
bool isVectorRegister(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
bool opCanUseLiteralConstant(unsigned OpType) const
static bool hasVGPRs(const TargetRegisterClass *RC)
static bool isVGPRClass(const TargetRegisterClass *RC)
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
const TargetRegisterClass * getBoolRC() const
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getChannelFromSubReg(unsigned SubReg) const
MCRegister getVCC() const
static bool hasAGPRs(const TargetRegisterClass *RC)
const TargetRegisterClass * getWaveMaskRegClass() const
bool spillSGPRToVGPR() const
const TargetRegisterClass * getVGPR64Class() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:65
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:237
SlotIndexes pass.
Definition: SlotIndexes.h:297
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:531
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:298
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
const TargetRegisterClass * getAllocatableClass(const TargetRegisterClass *RC) const
Return the maximal subclass of the given register class that is allocatable or NULL.
unsigned getSubRegIdxSize(unsigned Idx) const
Get the size of the bit range covered by a sub-register index.
unsigned getSubRegIdxOffset(unsigned Idx) const
Get the offset of the bit range covered by a sub-register index.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:95
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
Definition: SIInstrInfo.h:1591
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isDPALU_DPP(const MCInstrDesc &OpDesc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
Definition: SIInstrInfo.h:1592
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
bool isTrue16Inst(unsigned Opc)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo)
Is this an AMDGPU specific source operand? These include registers, inline constants,...
const uint64_t RSRC_TID_ENABLE
Definition: SIInstrInfo.h:1594
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isGenericAtomic(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition: SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition: SIDefines.h:201
@ OPERAND_REG_IMM_V2FP16
Definition: SIDefines.h:211
@ OPERAND_REG_INLINE_C_V2INT32
Definition: SIDefines.h:227
@ OPERAND_REG_INLINE_C_FP64
Definition: SIDefines.h:223
@ OPERAND_REG_INLINE_C_BF16
Definition: SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2BF16
Definition: SIDefines.h:225
@ OPERAND_REG_IMM_V2INT16
Definition: SIDefines.h:212
@ OPERAND_REG_IMM_BF16
Definition: SIDefines.h:205
@ OPERAND_REG_INLINE_AC_V2FP16
Definition: SIDefines.h:246
@ OPERAND_REG_IMM_INT32
Operands with register or 32-bit immediate.
Definition: SIDefines.h:200
@ OPERAND_REG_IMM_V2BF16
Definition: SIDefines.h:210
@ OPERAND_REG_IMM_BF16_DEFERRED
Definition: SIDefines.h:207
@ OPERAND_REG_IMM_FP16
Definition: SIDefines.h:206
@ OPERAND_REG_INLINE_C_INT64
Definition: SIDefines.h:219
@ OPERAND_REG_INLINE_AC_BF16
Definition: SIDefines.h:240
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition: SIDefines.h:217
@ OPERAND_REG_INLINE_AC_INT16
Operands with an AccVGPR register or inline constant.
Definition: SIDefines.h:238
@ OPERAND_REG_IMM_FP64
Definition: SIDefines.h:204
@ OPERAND_REG_INLINE_C_V2FP16
Definition: SIDefines.h:226
@ OPERAND_REG_INLINE_AC_V2INT16
Definition: SIDefines.h:244
@ OPERAND_REG_INLINE_AC_FP16
Definition: SIDefines.h:241
@ OPERAND_REG_INLINE_AC_INT32
Definition: SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition: SIDefines.h:242
@ OPERAND_REG_INLINE_AC_V2BF16
Definition: SIDefines.h:245
@ OPERAND_REG_IMM_V2INT32
Definition: SIDefines.h:213
@ OPERAND_REG_IMM_FP32
Definition: SIDefines.h:203
@ OPERAND_INPUT_MODS
Definition: SIDefines.h:251
@ OPERAND_REG_INLINE_C_FP32
Definition: SIDefines.h:222
@ OPERAND_REG_INLINE_C_INT32
Definition: SIDefines.h:218
@ OPERAND_REG_INLINE_C_V2INT16
Definition: SIDefines.h:224
@ OPERAND_REG_IMM_V2FP32
Definition: SIDefines.h:214
@ OPERAND_REG_INLINE_AC_FP64
Definition: SIDefines.h:243
@ OPERAND_REG_INLINE_C_FP16
Definition: SIDefines.h:221
@ OPERAND_REG_IMM_INT16
Definition: SIDefines.h:202
@ OPERAND_REG_INLINE_C_V2FP32
Definition: SIDefines.h:228
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition: SIDefines.h:231
@ OPERAND_REG_IMM_FP32_DEFERRED
Definition: SIDefines.h:209
@ OPERAND_REG_IMM_FP16_DEFERRED
Definition: SIDefines.h:208
@ TI_SCRATCH_RSRC_DWORD1
Definition: AMDGPU.h:470
@ TI_SCRATCH_RSRC_DWORD3
Definition: AMDGPU.h:472
@ TI_SCRATCH_RSRC_DWORD0
Definition: AMDGPU.h:469
@ TI_SCRATCH_RSRC_DWORD2
Definition: AMDGPU.h:471
@ TI_CONSTDATA_START
Definition: AMDGPU.h:468
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
int getMCOpcode(uint16_t Opcode, unsigned Gen)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
Definition: SIInstrInfo.h:1593
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
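The isInlinableLiteral* helpers decide whether an immediate can be encoded as an inline constant rather than as an extra literal dword. A sketch, assuming MO is an immediate machine operand and ST is the GCNSubtarget:

int64_t Imm = MO.getImm();
bool IsInline = AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
// If IsInline is false, the value has to be emitted as a literal operand.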
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
bool isGraphics(CallingConv::ID cc)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ OPERAND_GENERIC_4
Definition: MCInstrDesc.h:70
@ OPERAND_GENERIC_2
Definition: MCInstrDesc.h:68
@ OPERAND_GENERIC_1
Definition: MCInstrDesc.h:67
@ OPERAND_REGISTER
Definition: MCInstrDesc.h:61
@ OPERAND_GENERIC_3
Definition: MCInstrDesc.h:69
@ OPERAND_IMMEDIATE
Definition: MCInstrDesc.h:60
@ OPERAND_UNKNOWN
Definition: MCInstrDesc.h:59
@ OPERAND_GENERIC_0
Definition: MCInstrDesc.h:66
@ OPERAND_GENERIC_5
Definition: MCInstrDesc.h:71
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:138
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition: SIInstrInfo.h:1477
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
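BuildMI together with the MachineInstrBuilder add* methods listed above is how most machine instructions are created. A minimal sketch, assuming MBB, an insertion iterator I, a DebugLoc DL, and a 32-bit scalar destination register DestReg:

BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), DestReg)
    .addImm(0x3f800000); // materialize the bit pattern of 1.0f in an SGPR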
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
Definition: MathExtras.h:555
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:296
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:43
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:260
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
Definition: TargetOpcodes.h:36
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition: SIInstrInfo.h:39
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
@ DS_Error
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:47
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition: Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition: MathExtras.h:219
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition: APFloat.cpp:210
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Description of the encoding of one expression Op.
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
Definition: LiveVariables.h:83
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition: SIInstrInfo.h:51
MachineInstr * top() const
Definition: SIInstrInfo.h:56
bool empty() const
Definition: SIInstrInfo.h:66
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition: SIInstrInfo.h:75
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.