1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
31#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include "llvm/MC/MCContext.h"
35
36using namespace llvm;
37
38#define DEBUG_TYPE "si-instr-info"
39
40#define GET_INSTRINFO_CTOR_DTOR
41#include "AMDGPUGenInstrInfo.inc"
42
43namespace llvm::AMDGPU {
44#define GET_D16ImageDimIntrinsics_IMPL
45#define GET_ImageDimIntrinsicTable_IMPL
46#define GET_RsrcIntrinsics_IMPL
47#include "AMDGPUGenSearchableTables.inc"
48} // namespace llvm::AMDGPU
49
50// Must be at least 4 to be able to branch over minimum unconditional branch
51// code. This is only for making it possible to write reasonably small tests for
52// long branches.
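// For example, passing -amdgpu-s-branch-bits=4 (the minimum) lets a test
// exercise long-branch handling with only a few instructions.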
53static cl::opt<unsigned>
54BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
55 cl::desc("Restrict range of branch instructions (DEBUG)"));
56
57static cl::opt<bool> Fix16BitCopies(
58 "amdgpu-fix-16-bit-physreg-copies",
59 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
60 cl::init(true),
61 cl::ReallyHidden);
62
63SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
64 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
65 RI(ST), ST(ST) {
66 SchedModel.init(&ST);
67}
68
69//===----------------------------------------------------------------------===//
70// TargetInstrInfo callbacks
71//===----------------------------------------------------------------------===//
72
73static unsigned getNumOperandsNoGlue(SDNode *Node) {
74 unsigned N = Node->getNumOperands();
75 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
76 --N;
77 return N;
78}
79
80/// Returns true if both nodes have the same value for the given
81/// operand \p Op, or if both nodes do not have this operand.
82static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
83 unsigned Opc0 = N0->getMachineOpcode();
84 unsigned Opc1 = N1->getMachineOpcode();
85
86 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
87 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
88
89 if (Op0Idx == -1 && Op1Idx == -1)
90 return true;
91
92
93 if ((Op0Idx == -1 && Op1Idx != -1) ||
94 (Op1Idx == -1 && Op0Idx != -1))
95 return false;
96
97 // getNamedOperandIdx returns the index for the MachineInstr's operands,
98 // which includes the result as the first operand. We are indexing into the
99 // MachineSDNode's operands, so we need to skip the result operand to get
100 // the real index.
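 // E.g. a named operand at MachineInstr index 3 corresponds to MachineSDNode
 // operand index 2 once the result is skipped.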
101 --Op0Idx;
102 --Op1Idx;
103
104 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
105}
106
107static bool canRemat(const MachineInstr &MI) {
108
109 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
110 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
111 SIInstrInfo::isSALU(MI))
112 return true;
113
114 if (SIInstrInfo::isSMRD(MI)) {
115 return !MI.memoperands_empty() &&
116 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
117 return MMO->isLoad() && MMO->isInvariant();
118 });
119 }
120
121 return false;
122}
123
124bool SIInstrInfo::isReallyTriviallyReMaterializable(
125 const MachineInstr &MI) const {
126
127 if (canRemat(MI)) {
128 // Normally a VALU use of exec would block rematerialization, but an
129 // implicit exec read is OK here since all VALU instructions have one.
130 // Apart from that, we want all of the generic logic for this.
131
132 // Another potential implicit use is mode register. The core logic of
133 // the RA will not attempt rematerialization if mode is set anywhere
134 // in the function, otherwise it is safe since mode is not changed.
135
136 // This differs from the generic method, which does not allow
137 // rematerialization if there are virtual register uses. We allow this,
138 // and therefore this method includes SOP instructions as well.
139 if (!MI.hasImplicitDef() &&
140 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
141 !MI.mayRaiseFPException())
142 return true;
143 }
144
145 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
146}
147
148// Returns true if the scalar result of a VALU instruction depends on exec.
149static bool resultDependsOnExec(const MachineInstr &MI) {
150 // Ignore comparisons which are only used masked with exec.
151 // This allows some hoisting/sinking of VALU comparisons.
152 if (MI.isCompare()) {
153 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
154 Register DstReg = MI.getOperand(0).getReg();
155 if (!DstReg.isVirtual())
156 return true;
157 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
158 switch (Use.getOpcode()) {
159 case AMDGPU::S_AND_SAVEEXEC_B32:
160 case AMDGPU::S_AND_SAVEEXEC_B64:
161 break;
162 case AMDGPU::S_AND_B32:
163 case AMDGPU::S_AND_B64:
164 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
165 return true;
166 break;
167 default:
168 return true;
169 }
170 }
171 return false;
172 }
173
174 switch (MI.getOpcode()) {
175 default:
176 break;
177 case AMDGPU::V_READFIRSTLANE_B32:
178 return true;
179 }
180
181 return false;
182}
183
184bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
185 // Any implicit use of exec by VALU is not a real register read.
186 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
187 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
188}
189
190bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
191 MachineBasicBlock *SuccToSinkTo,
192 MachineCycleInfo *CI) const {
193 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
194 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
195 return true;
196
197 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
198 // Check if sinking of MI would create temporal divergent use.
199 for (auto Op : MI.uses()) {
200 if (Op.isReg() && Op.getReg().isVirtual() &&
201 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
202 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
203
204 // SgprDef defined inside cycle
205 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
206 if (FromCycle == nullptr)
207 continue;
208
209 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
210 // Check if there is a FromCycle that contains SgprDef's basic block but
211 // does not contain SuccToSinkTo and also has divergent exit condition.
212 while (FromCycle && !FromCycle->contains(ToCycle)) {
213 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
214 FromCycle->getExitingBlocks(ExitingBlocks);
215
216 // FromCycle has divergent exit condition.
217 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
218 if (hasDivergentBranch(ExitingBlock))
219 return false;
220 }
221
222 FromCycle = FromCycle->getParentCycle();
223 }
224 }
225 }
226
227 return true;
228}
229
230bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
231 int64_t &Offset0,
232 int64_t &Offset1) const {
233 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
234 return false;
235
236 unsigned Opc0 = Load0->getMachineOpcode();
237 unsigned Opc1 = Load1->getMachineOpcode();
238
239 // Make sure both are actually loads.
240 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
241 return false;
242
243 // A mayLoad instruction without a def is not a load. Likely a prefetch.
244 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
245 return false;
246
247 if (isDS(Opc0) && isDS(Opc1)) {
248
249 // FIXME: Handle this case:
250 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
251 return false;
252
253 // Check base reg.
254 if (Load0->getOperand(0) != Load1->getOperand(0))
255 return false;
256
257 // Skip read2 / write2 variants for simplicity.
258 // TODO: We should report true if the used offsets are adjacent (excluding
259 // st64 versions).
260 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
261 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
262 if (Offset0Idx == -1 || Offset1Idx == -1)
263 return false;
264
265 // XXX - be careful of dataless loads
266 // getNamedOperandIdx returns the index for MachineInstrs. Since they
267 // include the output in the operand list, but SDNodes don't, we need to
268 // subtract the index by one.
269 Offset0Idx -= get(Opc0).NumDefs;
270 Offset1Idx -= get(Opc1).NumDefs;
271 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
272 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
273 return true;
274 }
275
276 if (isSMRD(Opc0) && isSMRD(Opc1)) {
277 // Skip time and cache invalidation instructions.
278 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
279 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
280 return false;
281
282 unsigned NumOps = getNumOperandsNoGlue(Load0);
283 if (NumOps != getNumOperandsNoGlue(Load1))
284 return false;
285
286 // Check base reg.
287 if (Load0->getOperand(0) != Load1->getOperand(0))
288 return false;
289
290 // Match register offsets, if both register and immediate offsets present.
291 assert(NumOps == 4 || NumOps == 5);
292 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
293 return false;
294
295 const ConstantSDNode *Load0Offset =
296 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
297 const ConstantSDNode *Load1Offset =
298 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
299
300 if (!Load0Offset || !Load1Offset)
301 return false;
302
303 Offset0 = Load0Offset->getZExtValue();
304 Offset1 = Load1Offset->getZExtValue();
305 return true;
306 }
307
308 // MUBUF and MTBUF can access the same addresses.
309 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
310
311 // MUBUF and MTBUF have vaddr at different indices.
312 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
313 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
314 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
315 return false;
316
317 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
318 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
319
320 if (OffIdx0 == -1 || OffIdx1 == -1)
321 return false;
322
323 // getNamedOperandIdx returns the index for MachineInstrs. Since they
324 // include the output in the operand list, but SDNodes don't, we need to
325 // subtract the index by one.
326 OffIdx0 -= get(Opc0).NumDefs;
327 OffIdx1 -= get(Opc1).NumDefs;
328
329 SDValue Off0 = Load0->getOperand(OffIdx0);
330 SDValue Off1 = Load1->getOperand(OffIdx1);
331
332 // The offset might be a FrameIndexSDNode.
333 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
334 return false;
335
336 Offset0 = Off0->getAsZExtVal();
337 Offset1 = Off1->getAsZExtVal();
338 return true;
339 }
340
341 return false;
342}
343
344static bool isStride64(unsigned Opc) {
345 switch (Opc) {
346 case AMDGPU::DS_READ2ST64_B32:
347 case AMDGPU::DS_READ2ST64_B64:
348 case AMDGPU::DS_WRITE2ST64_B32:
349 case AMDGPU::DS_WRITE2ST64_B64:
350 return true;
351 default:
352 return false;
353 }
354}
355
356bool SIInstrInfo::getMemOperandsWithOffsetWidth(
357 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
358 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
359 const TargetRegisterInfo *TRI) const {
360 if (!LdSt.mayLoadOrStore())
361 return false;
362
363 unsigned Opc = LdSt.getOpcode();
364 OffsetIsScalable = false;
365 const MachineOperand *BaseOp, *OffsetOp;
366 int DataOpIdx;
367
368 if (isDS(LdSt)) {
369 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
370 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
371 if (OffsetOp) {
372 // Normal, single offset LDS instruction.
373 if (!BaseOp) {
374 // DS_CONSUME/DS_APPEND use M0 for the base address.
375 // TODO: find the implicit use operand for M0 and use that as BaseOp?
376 return false;
377 }
378 BaseOps.push_back(BaseOp);
379 Offset = OffsetOp->getImm();
380 // Get appropriate operand, and compute width accordingly.
381 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
382 if (DataOpIdx == -1)
383 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
384 Width = getOpSize(LdSt, DataOpIdx);
385 } else {
386 // The 2 offset instructions use offset0 and offset1 instead. We can treat
387 // these as a load with a single offset if the 2 offsets are consecutive.
388 // We will use this for some partially aligned loads.
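 // E.g. a ds_read2_b32 with offset0 = 4 and offset1 = 5 is reported as a
 // single access at byte offset 16 with an 8-byte width.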
389 const MachineOperand *Offset0Op =
390 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
391 const MachineOperand *Offset1Op =
392 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
393
394 unsigned Offset0 = Offset0Op->getImm() & 0xff;
395 unsigned Offset1 = Offset1Op->getImm() & 0xff;
396 if (Offset0 + 1 != Offset1)
397 return false;
398
399 // Each of these offsets is in element sized units, so we need to convert
400 // to bytes of the individual reads.
401
402 unsigned EltSize;
403 if (LdSt.mayLoad())
404 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
405 else {
406 assert(LdSt.mayStore());
407 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
408 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
409 }
410
411 if (isStride64(Opc))
412 EltSize *= 64;
413
414 BaseOps.push_back(BaseOp);
415 Offset = EltSize * Offset0;
416 // Get appropriate operand(s), and compute width accordingly.
417 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
418 if (DataOpIdx == -1) {
419 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 Width = getOpSize(LdSt, DataOpIdx);
421 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
422 Width = Width.getValue() + getOpSize(LdSt, DataOpIdx);
423 } else {
424 Width = getOpSize(LdSt, DataOpIdx);
425 }
426 }
427 return true;
428 }
429
430 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
431 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
432 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
433 return false;
434 BaseOps.push_back(RSrc);
435 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
436 if (BaseOp && !BaseOp->isFI())
437 BaseOps.push_back(BaseOp);
438 const MachineOperand *OffsetImm =
439 getNamedOperand(LdSt, AMDGPU::OpName::offset);
440 Offset = OffsetImm->getImm();
441 const MachineOperand *SOffset =
442 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
443 if (SOffset) {
444 if (SOffset->isReg())
445 BaseOps.push_back(SOffset);
446 else
447 Offset += SOffset->getImm();
448 }
449 // Get appropriate operand, and compute width accordingly.
450 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
451 if (DataOpIdx == -1)
452 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
453 if (DataOpIdx == -1) // LDS DMA
454 return false;
455 Width = getOpSize(LdSt, DataOpIdx);
456 return true;
457 }
458
459 if (isImage(LdSt)) {
460 auto RsrcOpName =
461 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
462 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
463 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
464 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
465 if (VAddr0Idx >= 0) {
466 // GFX10 possible NSA encoding.
467 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
468 BaseOps.push_back(&LdSt.getOperand(I));
469 } else {
470 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
471 }
472 Offset = 0;
473 // Get appropriate operand, and compute width accordingly.
474 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
475 if (DataOpIdx == -1)
476 return false; // no return sampler
477 Width = getOpSize(LdSt, DataOpIdx);
478 return true;
479 }
480
481 if (isSMRD(LdSt)) {
482 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
483 if (!BaseOp) // e.g. S_MEMTIME
484 return false;
485 BaseOps.push_back(BaseOp);
486 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
487 Offset = OffsetOp ? OffsetOp->getImm() : 0;
488 // Get appropriate operand, and compute width accordingly.
489 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
490 if (DataOpIdx == -1)
491 return false;
492 Width = getOpSize(LdSt, DataOpIdx);
493 return true;
494 }
495
496 if (isFLAT(LdSt)) {
497 // Instructions have either vaddr or saddr or both or none.
498 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
499 if (BaseOp)
500 BaseOps.push_back(BaseOp);
501 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
502 if (BaseOp)
503 BaseOps.push_back(BaseOp);
504 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
505 // Get appropriate operand, and compute width accordingly.
506 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
507 if (DataOpIdx == -1)
508 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
509 if (DataOpIdx == -1) // LDS DMA
510 return false;
511 Width = getOpSize(LdSt, DataOpIdx);
512 return true;
513 }
514
515 return false;
516}
517
518static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
519 ArrayRef<const MachineOperand *> BaseOps1,
520 const MachineInstr &MI2,
521 ArrayRef<const MachineOperand *> BaseOps2) {
522 // Only examine the first "base" operand of each instruction, on the
523 // assumption that it represents the real base address of the memory access.
524 // Other operands are typically offsets or indices from this base address.
525 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
526 return true;
527
528 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
529 return false;
530
531 auto *MO1 = *MI1.memoperands_begin();
532 auto *MO2 = *MI2.memoperands_begin();
533 if (MO1->getAddrSpace() != MO2->getAddrSpace())
534 return false;
535
536 const auto *Base1 = MO1->getValue();
537 const auto *Base2 = MO2->getValue();
538 if (!Base1 || !Base2)
539 return false;
540 Base1 = getUnderlyingObject(Base1);
541 Base2 = getUnderlyingObject(Base2);
542
543 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
544 return false;
545
546 return Base1 == Base2;
547}
548
549bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
550 int64_t Offset1, bool OffsetIsScalable1,
551 ArrayRef<const MachineOperand *> BaseOps2,
552 int64_t Offset2, bool OffsetIsScalable2,
553 unsigned ClusterSize,
554 unsigned NumBytes) const {
555 // If the mem ops (to be clustered) do not have the same base ptr, then they
556 // should not be clustered
557 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
558 if (!BaseOps1.empty() && !BaseOps2.empty()) {
559 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
560 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
561 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
562 return false;
563
564 const SIMachineFunctionInfo *MFI =
565 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
566 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
567 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
568 // If only one base op is empty, they do not have the same base ptr
569 return false;
570 }
571
572 // In order to avoid register pressure, on an average, the number of DWORDS
573 // loaded together by all clustered mem ops should not exceed
574 // MaxMemoryClusterDWords. This is an empirical value based on certain
575 // observations and performance related experiments.
576 // The good thing about this heuristic is that it avoids clustering of too
577 // many sub-word loads, and also avoids clustering of wide loads. Below is the
578 // brief summary of how the heuristic behaves for various `LoadSize` when
579 // MaxMemoryClusterDWords is 8.
580 //
581 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
582 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
583 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
584 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
585 // (5) LoadSize >= 17: do not cluster
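 // E.g. with the default limit of 8 DWORDs, ClusterSize = 4 and NumBytes = 32
 // gives NumDWords = 8 and is clustered, while NumBytes = 48 gives
 // NumDWords = 12 and is rejected.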
586 const unsigned LoadSize = NumBytes / ClusterSize;
587 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
588 return NumDWords <= MaxMemoryClusterDWords;
589}
590
591// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
592// the first 16 loads will be interleaved with the stores, and the next 16 will
593// be clustered as expected. It should really split into 2 16 store batches.
594//
595// Loads are clustered until this returns false, rather than trying to schedule
596// groups of stores. This also means we have to deal with saying different
597// address space loads should be clustered, and ones which might cause bank
598// conflicts.
599//
600// This might be deprecated so it might not be worth that much effort to fix.
601bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
602 int64_t Offset0, int64_t Offset1,
603 unsigned NumLoads) const {
604 assert(Offset1 > Offset0 &&
605 "Second offset should be larger than first offset!");
606 // If we have less than 16 loads in a row, and the offsets are within 64
607 // bytes, then schedule together.
608
609 // A cacheline is 64 bytes (for global memory).
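 // E.g. two loads at offsets 0 and 32 are scheduled together, while loads at
 // offsets 0 and 128 are not.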
610 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
611}
612
613static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
614 MachineBasicBlock::iterator MI,
615 const DebugLoc &DL, MCRegister DestReg,
616 MCRegister SrcReg, bool KillSrc,
617 const char *Msg = "illegal VGPR to SGPR copy") {
618 MachineFunction *MF = MBB.getParent();
619 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
620 LLVMContext &C = MF->getFunction().getContext();
621 C.diagnose(IllegalCopy);
622
623 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
624 .addReg(SrcReg, getKillRegState(KillSrc));
625}
626
627/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
628/// possible to have a direct copy in these cases on GFX908, so an intermediate
629/// VGPR copy is required.
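/// For example, an SGPR to AGPR copy is emitted as a v_mov_b32 into a
/// temporary VGPR followed by a v_accvgpr_write from that VGPR.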
630static void indirectCopyToAGPR(const SIInstrInfo &TII,
631 MachineBasicBlock &MBB,
632 MachineBasicBlock::iterator MI,
633 const DebugLoc &DL, MCRegister DestReg,
634 MCRegister SrcReg, bool KillSrc,
635 RegScavenger &RS, bool RegsOverlap,
636 Register ImpDefSuperReg = Register(),
637 Register ImpUseSuperReg = Register()) {
638 assert((TII.getSubtarget().hasMAIInsts() &&
639 !TII.getSubtarget().hasGFX90AInsts()) &&
640 "Expected GFX908 subtarget.");
641
642 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
643 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
644 "Source register of the copy should be either an SGPR or an AGPR.");
645
646 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
647 "Destination register of the copy should be an AGPR.");
648
649 const SIRegisterInfo &RI = TII.getRegisterInfo();
650
651 // First try to find defining accvgpr_write to avoid temporary registers.
652 // In the case of copies of overlapping AGPRs, we conservatively do not
653 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
654 // an accvgpr_write used for this same copy due to implicit-defs
655 if (!RegsOverlap) {
656 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
657 --Def;
658
659 if (!Def->modifiesRegister(SrcReg, &RI))
660 continue;
661
662 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
663 Def->getOperand(0).getReg() != SrcReg)
664 break;
665
666 MachineOperand &DefOp = Def->getOperand(1);
667 assert(DefOp.isReg() || DefOp.isImm());
668
669 if (DefOp.isReg()) {
670 bool SafeToPropagate = true;
671 // Check that register source operand is not clobbered before MI.
672 // Immediate operands are always safe to propagate.
673 for (auto I = Def; I != MI && SafeToPropagate; ++I)
674 if (I->modifiesRegister(DefOp.getReg(), &RI))
675 SafeToPropagate = false;
676
677 if (!SafeToPropagate)
678 break;
679
680 DefOp.setIsKill(false);
681 }
682
683 MachineInstrBuilder Builder =
684 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
685 .add(DefOp);
686 if (ImpDefSuperReg)
687 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
688
689 if (ImpUseSuperReg) {
690 Builder.addReg(ImpUseSuperReg,
691 getKillRegState(KillSrc) | RegState::Implicit);
692 }
693
694 return;
695 }
696 }
697
698 RS.enterBasicBlockEnd(MBB);
699 RS.backward(std::next(MI));
700
701 // Ideally we want to have three registers for a long reg_sequence copy
702 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
703 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
704 *MBB.getParent());
705
706 // Registers in the sequence are allocated contiguously so we can just
707 // use register number to pick one of three round-robin temps.
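 // E.g. a copy whose destination is AGPR7 uses temporary slot 7 % 3 == 1.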
708 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
709 Register Tmp =
710 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
711 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
712 "VGPR used for an intermediate copy should have been reserved.");
713
714 // Only loop through if there are any free registers left. We don't want to
715 // spill.
716 while (RegNo--) {
717 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
718 /* RestoreAfter */ false, 0,
719 /* AllowSpill */ false);
720 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
721 break;
722 Tmp = Tmp2;
723 RS.setRegUsed(Tmp);
724 }
725
726 // Insert copy to temporary VGPR.
727 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
728 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
729 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
730 } else {
731 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
732 }
733
734 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
735 .addReg(SrcReg, getKillRegState(KillSrc));
736 if (ImpUseSuperReg) {
737 UseBuilder.addReg(ImpUseSuperReg,
738 getKillRegState(KillSrc) | RegState::Implicit);
739 }
740
741 MachineInstrBuilder DefBuilder
742 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
743 .addReg(Tmp, RegState::Kill);
744
745 if (ImpDefSuperReg)
746 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
747}
748
749static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
750 MachineBasicBlock::iterator I, const DebugLoc &DL,
751 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
752 const TargetRegisterClass *RC, bool Forward) {
753 const SIRegisterInfo &RI = TII.getRegisterInfo();
754 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
756 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
757
758 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
759 int16_t SubIdx = BaseIndices[Idx];
760 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
761 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
762 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
763 unsigned Opcode = AMDGPU::S_MOV_B32;
764
765 // Is SGPR aligned? If so try to combine with next.
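 // E.g. an s[4:7] to s[8:11] copy is emitted as two S_MOV_B64 instead of
 // four S_MOV_B32.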
766 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
767 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
768 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
769 // Can use SGPR64 copy
770 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
771 SubIdx = RI.getSubRegFromChannel(Channel, 2);
772 DestSubReg = RI.getSubReg(DestReg, SubIdx);
773 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
774 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
775 Opcode = AMDGPU::S_MOV_B64;
776 Idx++;
777 }
778
779 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
780 .addReg(SrcSubReg)
781 .addReg(SrcReg, RegState::Implicit);
782
783 if (!FirstMI)
784 FirstMI = LastMI;
785
786 if (!Forward)
787 I--;
788 }
789
790 assert(FirstMI && LastMI);
791 if (!Forward)
792 std::swap(FirstMI, LastMI);
793
794 FirstMI->addOperand(
795 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
796
797 if (KillSrc)
798 LastMI->addRegisterKilled(SrcReg, &RI);
799}
800
801void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
802 MachineBasicBlock::iterator MI,
803 const DebugLoc &DL, MCRegister DestReg,
804 MCRegister SrcReg, bool KillSrc,
805 bool RenamableDest, bool RenamableSrc) const {
806 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
807 unsigned Size = RI.getRegSizeInBits(*RC);
808 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
809 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
810
811 // The rest of copyPhysReg assumes Src and Dst size are the same size.
812 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
813 // we remove Fix16BitCopies and this code block?
814 if (Fix16BitCopies) {
815 if (((Size == 16) != (SrcSize == 16))) {
816 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
818 MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
819 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
820 RegToFix = SubReg;
821
822 if (DestReg == SrcReg) {
823 // Identity copy. Insert empty bundle since ExpandPostRA expects an
824 // instruction here.
825 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
826 return;
827 }
828 RC = RI.getPhysRegBaseClass(DestReg);
829 Size = RI.getRegSizeInBits(*RC);
830 SrcRC = RI.getPhysRegBaseClass(SrcReg);
831 SrcSize = RI.getRegSizeInBits(*SrcRC);
832 }
833 }
834
835 if (RC == &AMDGPU::VGPR_32RegClass) {
836 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
837 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
838 AMDGPU::AGPR_32RegClass.contains(SrcReg));
839 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
840 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
841 BuildMI(MBB, MI, DL, get(Opc), DestReg)
842 .addReg(SrcReg, getKillRegState(KillSrc));
843 return;
844 }
845
846 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
847 RC == &AMDGPU::SReg_32RegClass) {
848 if (SrcReg == AMDGPU::SCC) {
849 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
850 .addImm(1)
851 .addImm(0);
852 return;
853 }
854
855 if (DestReg == AMDGPU::VCC_LO) {
856 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
857 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
858 .addReg(SrcReg, getKillRegState(KillSrc));
859 } else {
860 // FIXME: Hack until VReg_1 removed.
861 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
862 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
863 .addImm(0)
864 .addReg(SrcReg, getKillRegState(KillSrc));
865 }
866
867 return;
868 }
869
870 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
871 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
872 return;
873 }
874
875 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
876 .addReg(SrcReg, getKillRegState(KillSrc));
877 return;
878 }
879
880 if (RC == &AMDGPU::SReg_64RegClass) {
881 if (SrcReg == AMDGPU::SCC) {
882 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
883 .addImm(1)
884 .addImm(0);
885 return;
886 }
887
888 if (DestReg == AMDGPU::VCC) {
889 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
890 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
891 .addReg(SrcReg, getKillRegState(KillSrc));
892 } else {
893 // FIXME: Hack until VReg_1 removed.
894 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
895 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
896 .addImm(0)
897 .addReg(SrcReg, getKillRegState(KillSrc));
898 }
899
900 return;
901 }
902
903 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
904 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
905 return;
906 }
907
908 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
909 .addReg(SrcReg, getKillRegState(KillSrc));
910 return;
911 }
912
913 if (DestReg == AMDGPU::SCC) {
914 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
915 // but SelectionDAG emits such copies for i1 sources.
916 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
917 // This copy can only be produced by patterns
918 // with explicit SCC, which are known to be enabled
919 // only for subtargets with S_CMP_LG_U64 present.
920 assert(ST.hasScalarCompareEq64());
921 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
922 .addReg(SrcReg, getKillRegState(KillSrc))
923 .addImm(0);
924 } else {
925 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
926 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
927 .addReg(SrcReg, getKillRegState(KillSrc))
928 .addImm(0);
929 }
930
931 return;
932 }
933
934 if (RC == &AMDGPU::AGPR_32RegClass) {
935 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
936 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
937 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
938 .addReg(SrcReg, getKillRegState(KillSrc));
939 return;
940 }
941
942 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
943 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
944 .addReg(SrcReg, getKillRegState(KillSrc));
945 return;
946 }
947
948 // FIXME: Pass should maintain scavenger to avoid scan through the block on
949 // every AGPR spill.
950 RegScavenger RS;
951 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
952 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
953 return;
954 }
955
956 if (Size == 16) {
957 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
958 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
959 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
960
961 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
962 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
963 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
964 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
965 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
966 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
967 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
968 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
969
970 if (IsSGPRDst) {
971 if (!IsSGPRSrc) {
972 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
973 return;
974 }
975
976 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
977 .addReg(NewSrcReg, getKillRegState(KillSrc));
978 return;
979 }
980
981 if (IsAGPRDst || IsAGPRSrc) {
982 if (!DstLow || !SrcLow) {
983 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
984 "Cannot use hi16 subreg with an AGPR!");
985 }
986
987 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
988 return;
989 }
990
991 if (ST.hasTrue16BitInsts()) {
992 if (IsSGPRSrc) {
993 assert(SrcLow);
994 SrcReg = NewSrcReg;
995 }
996 // Use the smaller instruction encoding if possible.
997 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
998 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
999 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1000 .addReg(SrcReg);
1001 } else {
1002 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1003 .addImm(0) // src0_modifiers
1004 .addReg(SrcReg)
1005 .addImm(0); // op_sel
1006 }
1007 return;
1008 }
1009
1010 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1011 if (!DstLow || !SrcLow) {
1012 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1013 "Cannot use hi16 subreg on VI!");
1014 }
1015
1016 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1017 .addReg(NewSrcReg, getKillRegState(KillSrc));
1018 return;
1019 }
1020
1021 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1022 .addImm(0) // src0_modifiers
1023 .addReg(NewSrcReg)
1024 .addImm(0) // clamp
1031 // First implicit operand is $exec.
1032 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1033 return;
1034 }
1035
1036 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1037 if (ST.hasMovB64()) {
1038 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1039 .addReg(SrcReg, getKillRegState(KillSrc));
1040 return;
1041 }
1042 if (ST.hasPkMovB32()) {
1043 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1044 .addImm(SISrcMods::OP_SEL_1)
1045 .addReg(SrcReg)
1046 .addImm(SISrcMods::OP_SEL_1)
1047 .addReg(SrcReg)
1048 .addImm(0) // op_sel_lo
1049 .addImm(0) // op_sel_hi
1050 .addImm(0) // neg_lo
1051 .addImm(0) // neg_hi
1052 .addImm(0) // clamp
1053 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1054 return;
1055 }
1056 }
1057
1058 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1059 if (RI.isSGPRClass(RC)) {
1060 if (!RI.isSGPRClass(SrcRC)) {
1061 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1062 return;
1063 }
1064 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1065 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1066 Forward);
1067 return;
1068 }
1069
1070 unsigned EltSize = 4;
1071 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1072 if (RI.isAGPRClass(RC)) {
1073 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1074 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1075 else if (RI.hasVGPRs(SrcRC) ||
1076 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1077 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1078 else
1079 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1080 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1081 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1082 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1083 (RI.isProperlyAlignedRC(*RC) &&
1084 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1085 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1086 if (ST.hasMovB64()) {
1087 Opcode = AMDGPU::V_MOV_B64_e32;
1088 EltSize = 8;
1089 } else if (ST.hasPkMovB32()) {
1090 Opcode = AMDGPU::V_PK_MOV_B32;
1091 EltSize = 8;
1092 }
1093 }
1094
1095 // For the cases where we need an intermediate instruction/temporary register
1096 // (destination is an AGPR), we need a scavenger.
1097 //
1098 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1099 // whole block for every handled copy.
1100 std::unique_ptr<RegScavenger> RS;
1101 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1102 RS = std::make_unique<RegScavenger>();
1103
1104 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1105
1106 // If there is an overlap, we can't kill the super-register on the last
1107 // instruction, since it will also kill the components made live by this def.
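 // E.g. when copying v[1:2] into v[0:1], killing v[1:2] on the final
 // sub-copy would also kill v1, which this copy has just defined.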
1108 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1109 const bool CanKillSuperReg = KillSrc && !Overlap;
1110
1111 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1112 unsigned SubIdx;
1113 if (Forward)
1114 SubIdx = SubIndices[Idx];
1115 else
1116 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1117 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1118 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1119 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1120
1121 bool IsFirstSubreg = Idx == 0;
1122 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1123
1124 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1125 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1126 Register ImpUseSuper = SrcReg;
1127 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1128 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1129 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1130 MachineInstrBuilder MIB =
1131 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1132 .addImm(SISrcMods::OP_SEL_1)
1133 .addReg(SrcSubReg)
1134 .addImm(SISrcMods::OP_SEL_1)
1135 .addReg(SrcSubReg)
1136 .addImm(0) // op_sel_lo
1137 .addImm(0) // op_sel_hi
1138 .addImm(0) // neg_lo
1139 .addImm(0) // neg_hi
1140 .addImm(0) // clamp
1141 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1142 if (IsFirstSubreg)
1143 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1144 } else {
1145 MachineInstrBuilder Builder =
1146 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1147 if (IsFirstSubreg)
1148 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1149
1150 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1151 }
1152 }
1153}
1154
1155int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1156 int NewOpc;
1157
1158 // Try to map original to commuted opcode
1159 NewOpc = AMDGPU::getCommuteRev(Opcode);
1160 if (NewOpc != -1)
1161 // Check if the commuted (REV) opcode exists on the target.
1162 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1163
1164 // Try to map commuted to original opcode
1165 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1166 if (NewOpc != -1)
1167 // Check if the original (non-REV) opcode exists on the target.
1168 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1169
1170 return Opcode;
1171}
1172
1173void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
1174 MachineBasicBlock::iterator MI,
1175 const DebugLoc &DL, Register DestReg,
1176 int64_t Value) const {
1177 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1178 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1179 if (RegClass == &AMDGPU::SReg_32RegClass ||
1180 RegClass == &AMDGPU::SGPR_32RegClass ||
1181 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1182 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1183 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1184 .addImm(Value);
1185 return;
1186 }
1187
1188 if (RegClass == &AMDGPU::SReg_64RegClass ||
1189 RegClass == &AMDGPU::SGPR_64RegClass ||
1190 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1191 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1192 .addImm(Value);
1193 return;
1194 }
1195
1196 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1197 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1198 .addImm(Value);
1199 return;
1200 }
1201 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1202 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1203 .addImm(Value);
1204 return;
1205 }
1206
1207 unsigned EltSize = 4;
1208 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1209 if (RI.isSGPRClass(RegClass)) {
1210 if (RI.getRegSizeInBits(*RegClass) > 32) {
1211 Opcode = AMDGPU::S_MOV_B64;
1212 EltSize = 8;
1213 } else {
1214 Opcode = AMDGPU::S_MOV_B32;
1215 EltSize = 4;
1216 }
1217 }
1218
1219 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1220 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1221 int64_t IdxValue = Idx == 0 ? Value : 0;
1222
1223 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1224 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1225 Builder.addImm(IdxValue);
1226 }
1227}
1228
1229const TargetRegisterClass *
1230SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1231 return &AMDGPU::VGPR_32RegClass;
1232}
1233
1234void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1235 MachineBasicBlock::iterator I,
1236 const DebugLoc &DL, Register DstReg,
1237 ArrayRef<MachineOperand> Cond,
1238 Register TrueReg,
1239 Register FalseReg) const {
1240 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1241 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1242 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1243 "Not a VGPR32 reg");
1244
1245 if (Cond.size() == 1) {
1246 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1247 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1248 .add(Cond[0]);
1249 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1250 .addImm(0)
1251 .addReg(FalseReg)
1252 .addImm(0)
1253 .addReg(TrueReg)
1254 .addReg(SReg);
1255 } else if (Cond.size() == 2) {
1256 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1257 switch (Cond[0].getImm()) {
1258 case SIInstrInfo::SCC_TRUE: {
1259 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1260 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1261 : AMDGPU::S_CSELECT_B64), SReg)
1262 .addImm(1)
1263 .addImm(0);
1264 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1265 .addImm(0)
1266 .addReg(FalseReg)
1267 .addImm(0)
1268 .addReg(TrueReg)
1269 .addReg(SReg);
1270 break;
1271 }
1272 case SIInstrInfo::SCC_FALSE: {
1273 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1274 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1275 : AMDGPU::S_CSELECT_B64), SReg)
1276 .addImm(0)
1277 .addImm(1);
1278 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1279 .addImm(0)
1280 .addReg(FalseReg)
1281 .addImm(0)
1282 .addReg(TrueReg)
1283 .addReg(SReg);
1284 break;
1285 }
1286 case SIInstrInfo::VCCNZ: {
1287 MachineOperand RegOp = Cond[1];
1288 RegOp.setImplicit(false);
1289 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1290 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1291 .add(RegOp);
1292 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1293 .addImm(0)
1294 .addReg(FalseReg)
1295 .addImm(0)
1296 .addReg(TrueReg)
1297 .addReg(SReg);
1298 break;
1299 }
1300 case SIInstrInfo::VCCZ: {
1301 MachineOperand RegOp = Cond[1];
1302 RegOp.setImplicit(false);
1303 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1304 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1305 .add(RegOp);
1306 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1307 .addImm(0)
1308 .addReg(TrueReg)
1309 .addImm(0)
1310 .addReg(FalseReg)
1311 .addReg(SReg);
1312 break;
1313 }
1314 case SIInstrInfo::EXECNZ: {
1315 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1316 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1317 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1318 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1319 .addImm(0);
1320 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1321 : AMDGPU::S_CSELECT_B64), SReg)
1322 .addImm(1)
1323 .addImm(0);
1324 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1325 .addImm(0)
1326 .addReg(FalseReg)
1327 .addImm(0)
1328 .addReg(TrueReg)
1329 .addReg(SReg);
1330 break;
1331 }
1332 case SIInstrInfo::EXECZ: {
1333 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1334 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1335 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1336 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1337 .addImm(0);
1338 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1339 : AMDGPU::S_CSELECT_B64), SReg)
1340 .addImm(0)
1341 .addImm(1);
1342 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1343 .addImm(0)
1344 .addReg(FalseReg)
1345 .addImm(0)
1346 .addReg(TrueReg)
1347 .addReg(SReg);
1348 llvm_unreachable("Unhandled branch predicate EXECZ");
1349 break;
1350 }
1351 default:
1352 llvm_unreachable("invalid branch predicate");
1353 }
1354 } else {
1355 llvm_unreachable("Can only handle Cond size 1 or 2");
1356 }
1357}
1358
1359Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1360 MachineBasicBlock::iterator I,
1361 const DebugLoc &DL,
1362 Register SrcReg, int Value) const {
1363 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1364 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1365 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1366 .addImm(Value)
1367 .addReg(SrcReg);
1368
1369 return Reg;
1370}
1371
1372Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1373 MachineBasicBlock::iterator I,
1374 const DebugLoc &DL,
1375 Register SrcReg, int Value) const {
1376 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1377 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1378 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1379 .addImm(Value)
1380 .addReg(SrcReg);
1381
1382 return Reg;
1383}
1384
1385unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1386
1387 if (RI.isAGPRClass(DstRC))
1388 return AMDGPU::COPY;
1389 if (RI.getRegSizeInBits(*DstRC) == 16) {
1390 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1391 // before RA.
1392 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1393 }
1394 if (RI.getRegSizeInBits(*DstRC) == 32)
1395 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1396 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1397 return AMDGPU::S_MOV_B64;
1398 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1399 return AMDGPU::V_MOV_B64_PSEUDO;
1400 return AMDGPU::COPY;
1401}
1402
1403const MCInstrDesc &
1404SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1405 bool IsIndirectSrc) const {
1406 if (IsIndirectSrc) {
1407 if (VecSize <= 32) // 4 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1409 if (VecSize <= 64) // 8 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1411 if (VecSize <= 96) // 12 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1413 if (VecSize <= 128) // 16 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1415 if (VecSize <= 160) // 20 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1417 if (VecSize <= 256) // 32 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1419 if (VecSize <= 288) // 36 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1421 if (VecSize <= 320) // 40 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1423 if (VecSize <= 352) // 44 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1425 if (VecSize <= 384) // 48 bytes
1426 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1427 if (VecSize <= 512) // 64 bytes
1428 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1429 if (VecSize <= 1024) // 128 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1431
1432 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1433 }
1434
1435 if (VecSize <= 32) // 4 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1437 if (VecSize <= 64) // 8 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1439 if (VecSize <= 96) // 12 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1441 if (VecSize <= 128) // 16 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1443 if (VecSize <= 160) // 20 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1445 if (VecSize <= 256) // 32 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1447 if (VecSize <= 288) // 36 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1449 if (VecSize <= 320) // 40 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1451 if (VecSize <= 352) // 44 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1453 if (VecSize <= 384) // 48 bytes
1454 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1455 if (VecSize <= 512) // 64 bytes
1456 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1457 if (VecSize <= 1024) // 128 bytes
1458 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1459
1460 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1461}
1462
1463static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1464 if (VecSize <= 32) // 4 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1466 if (VecSize <= 64) // 8 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1468 if (VecSize <= 96) // 12 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1470 if (VecSize <= 128) // 16 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1472 if (VecSize <= 160) // 20 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1474 if (VecSize <= 256) // 32 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1476 if (VecSize <= 288) // 36 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1478 if (VecSize <= 320) // 40 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1480 if (VecSize <= 352) // 44 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1482 if (VecSize <= 384) // 48 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1484 if (VecSize <= 512) // 64 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1486 if (VecSize <= 1024) // 128 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1488
1489 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1490}
1491
1492static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1493 if (VecSize <= 32) // 4 bytes
1494 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1495 if (VecSize <= 64) // 8 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1497 if (VecSize <= 96) // 12 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1499 if (VecSize <= 128) // 16 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1501 if (VecSize <= 160) // 20 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1503 if (VecSize <= 256) // 32 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1505 if (VecSize <= 288) // 36 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1507 if (VecSize <= 320) // 40 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1509 if (VecSize <= 352) // 44 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1511 if (VecSize <= 384) // 48 bytes
1512 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1513 if (VecSize <= 512) // 64 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1515 if (VecSize <= 1024) // 128 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1517
1518 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1519}
1520
1521static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1522 if (VecSize <= 64) // 8 bytes
1523 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1524 if (VecSize <= 128) // 16 bytes
1525 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1526 if (VecSize <= 256) // 32 bytes
1527 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1528 if (VecSize <= 512) // 64 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1530 if (VecSize <= 1024) // 128 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1532
1533 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1534}
1535
1536const MCInstrDesc &
1537SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1538 bool IsSGPR) const {
1539 if (IsSGPR) {
1540 switch (EltSize) {
1541 case 32:
1542 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1543 case 64:
1544 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1545 default:
1546 llvm_unreachable("invalid reg indexing elt size");
1547 }
1548 }
1549
1550 assert(EltSize == 32 && "invalid reg indexing elt size");
1551 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1552}
1553
1554static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1555 switch (Size) {
1556 case 4:
1557 return AMDGPU::SI_SPILL_S32_SAVE;
1558 case 8:
1559 return AMDGPU::SI_SPILL_S64_SAVE;
1560 case 12:
1561 return AMDGPU::SI_SPILL_S96_SAVE;
1562 case 16:
1563 return AMDGPU::SI_SPILL_S128_SAVE;
1564 case 20:
1565 return AMDGPU::SI_SPILL_S160_SAVE;
1566 case 24:
1567 return AMDGPU::SI_SPILL_S192_SAVE;
1568 case 28:
1569 return AMDGPU::SI_SPILL_S224_SAVE;
1570 case 32:
1571 return AMDGPU::SI_SPILL_S256_SAVE;
1572 case 36:
1573 return AMDGPU::SI_SPILL_S288_SAVE;
1574 case 40:
1575 return AMDGPU::SI_SPILL_S320_SAVE;
1576 case 44:
1577 return AMDGPU::SI_SPILL_S352_SAVE;
1578 case 48:
1579 return AMDGPU::SI_SPILL_S384_SAVE;
1580 case 64:
1581 return AMDGPU::SI_SPILL_S512_SAVE;
1582 case 128:
1583 return AMDGPU::SI_SPILL_S1024_SAVE;
1584 default:
1585 llvm_unreachable("unknown register size");
1586 }
1587}
1588
1589static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1590 switch (Size) {
1591 case 4:
1592 return AMDGPU::SI_SPILL_V32_SAVE;
1593 case 8:
1594 return AMDGPU::SI_SPILL_V64_SAVE;
1595 case 12:
1596 return AMDGPU::SI_SPILL_V96_SAVE;
1597 case 16:
1598 return AMDGPU::SI_SPILL_V128_SAVE;
1599 case 20:
1600 return AMDGPU::SI_SPILL_V160_SAVE;
1601 case 24:
1602 return AMDGPU::SI_SPILL_V192_SAVE;
1603 case 28:
1604 return AMDGPU::SI_SPILL_V224_SAVE;
1605 case 32:
1606 return AMDGPU::SI_SPILL_V256_SAVE;
1607 case 36:
1608 return AMDGPU::SI_SPILL_V288_SAVE;
1609 case 40:
1610 return AMDGPU::SI_SPILL_V320_SAVE;
1611 case 44:
1612 return AMDGPU::SI_SPILL_V352_SAVE;
1613 case 48:
1614 return AMDGPU::SI_SPILL_V384_SAVE;
1615 case 64:
1616 return AMDGPU::SI_SPILL_V512_SAVE;
1617 case 128:
1618 return AMDGPU::SI_SPILL_V1024_SAVE;
1619 default:
1620 llvm_unreachable("unknown register size");
1621 }
1622}
1623
1624static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1625 switch (Size) {
1626 case 4:
1627 return AMDGPU::SI_SPILL_A32_SAVE;
1628 case 8:
1629 return AMDGPU::SI_SPILL_A64_SAVE;
1630 case 12:
1631 return AMDGPU::SI_SPILL_A96_SAVE;
1632 case 16:
1633 return AMDGPU::SI_SPILL_A128_SAVE;
1634 case 20:
1635 return AMDGPU::SI_SPILL_A160_SAVE;
1636 case 24:
1637 return AMDGPU::SI_SPILL_A192_SAVE;
1638 case 28:
1639 return AMDGPU::SI_SPILL_A224_SAVE;
1640 case 32:
1641 return AMDGPU::SI_SPILL_A256_SAVE;
1642 case 36:
1643 return AMDGPU::SI_SPILL_A288_SAVE;
1644 case 40:
1645 return AMDGPU::SI_SPILL_A320_SAVE;
1646 case 44:
1647 return AMDGPU::SI_SPILL_A352_SAVE;
1648 case 48:
1649 return AMDGPU::SI_SPILL_A384_SAVE;
1650 case 64:
1651 return AMDGPU::SI_SPILL_A512_SAVE;
1652 case 128:
1653 return AMDGPU::SI_SPILL_A1024_SAVE;
1654 default:
1655 llvm_unreachable("unknown register size");
1656 }
1657}
1658
1659static unsigned getAVSpillSaveOpcode(unsigned Size) {
1660 switch (Size) {
1661 case 4:
1662 return AMDGPU::SI_SPILL_AV32_SAVE;
1663 case 8:
1664 return AMDGPU::SI_SPILL_AV64_SAVE;
1665 case 12:
1666 return AMDGPU::SI_SPILL_AV96_SAVE;
1667 case 16:
1668 return AMDGPU::SI_SPILL_AV128_SAVE;
1669 case 20:
1670 return AMDGPU::SI_SPILL_AV160_SAVE;
1671 case 24:
1672 return AMDGPU::SI_SPILL_AV192_SAVE;
1673 case 28:
1674 return AMDGPU::SI_SPILL_AV224_SAVE;
1675 case 32:
1676 return AMDGPU::SI_SPILL_AV256_SAVE;
1677 case 36:
1678 return AMDGPU::SI_SPILL_AV288_SAVE;
1679 case 40:
1680 return AMDGPU::SI_SPILL_AV320_SAVE;
1681 case 44:
1682 return AMDGPU::SI_SPILL_AV352_SAVE;
1683 case 48:
1684 return AMDGPU::SI_SPILL_AV384_SAVE;
1685 case 64:
1686 return AMDGPU::SI_SPILL_AV512_SAVE;
1687 case 128:
1688 return AMDGPU::SI_SPILL_AV1024_SAVE;
1689 default:
1690 llvm_unreachable("unknown register size");
1691 }
1692}
1693
1694static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1695 bool IsVectorSuperClass) {
1696 // Currently, only 32-bit WWM register spills are needed.
1697 if (Size != 4)
1698 llvm_unreachable("unknown wwm register spill size");
1699
1700 if (IsVectorSuperClass)
1701 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1702
1703 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1704}
1705
1706static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1707 const TargetRegisterClass *RC,
1708 unsigned Size,
1709 const SIRegisterInfo &TRI,
1710 const SIMachineFunctionInfo &MFI) {
1711 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1712
1713 // Choose the right opcode if spilling a WWM register.
1714 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1715 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1716
1717 if (IsVectorSuperClass)
1718 return getAVSpillSaveOpcode(Size);
1719
1720 return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1721 : getVGPRSpillSaveOpcode(Size);
1722}
1723
1724void SIInstrInfo::storeRegToStackSlot(
1725 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1726 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1727 const TargetRegisterInfo *TRI, Register VReg,
1728 MachineInstr::MIFlag Flags) const {
1729 MachineFunction *MF = MBB.getParent();
1730 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1731 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1732 const DebugLoc &DL = MBB.findDebugLoc(MI);
1733
1734 MachinePointerInfo PtrInfo
1735 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1736 MachineMemOperand *MMO = MF->getMachineMemOperand(
1737 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1738 FrameInfo.getObjectAlign(FrameIndex));
1739 unsigned SpillSize = TRI->getSpillSize(*RC);
1740
1741 MachineRegisterInfo &MRI = MF->getRegInfo();
1742 if (RI.isSGPRClass(RC)) {
1743 MFI->setHasSpilledSGPRs();
1744 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1745 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1746 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1747
1748 // We are only allowed to create one new instruction when spilling
1749 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1750 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1751
1752 // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1753 // need to make sure we are using the correct register class.
1754 if (SrcReg.isVirtual() && SpillSize == 4) {
1755 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1756 }
1757
1758 BuildMI(MBB, MI, DL, OpDesc)
1759 .addReg(SrcReg, getKillRegState(isKill)) // data
1760 .addFrameIndex(FrameIndex) // addr
1761 .addMemOperand(MMO)
1762 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1763
1764 if (RI.spillSGPRToVGPR())
1765 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1766 return;
1767 }
1768
1769 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1770 SpillSize, RI, *MFI);
1771 MFI->setHasSpilledVGPRs();
1772
1773 BuildMI(MBB, MI, DL, get(Opcode))
1774 .addReg(SrcReg, getKillRegState(isKill)) // data
1775 .addFrameIndex(FrameIndex) // addr
1776 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1777 .addImm(0) // offset
1778 .addMemOperand(MMO);
1779}
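// For example, spilling a single VGPR through the hook above typically yields
// a pseudo along the lines of (illustrative MIR; exact operands depend on the
// frame index and the stack pointer register in use):
//   SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0
//       :: (store (s32) into %stack.0, addrspace 5)
// The pseudo is turned into real scratch stores when frame indices are
// eliminated.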
1780
1781static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1782 switch (Size) {
1783 case 4:
1784 return AMDGPU::SI_SPILL_S32_RESTORE;
1785 case 8:
1786 return AMDGPU::SI_SPILL_S64_RESTORE;
1787 case 12:
1788 return AMDGPU::SI_SPILL_S96_RESTORE;
1789 case 16:
1790 return AMDGPU::SI_SPILL_S128_RESTORE;
1791 case 20:
1792 return AMDGPU::SI_SPILL_S160_RESTORE;
1793 case 24:
1794 return AMDGPU::SI_SPILL_S192_RESTORE;
1795 case 28:
1796 return AMDGPU::SI_SPILL_S224_RESTORE;
1797 case 32:
1798 return AMDGPU::SI_SPILL_S256_RESTORE;
1799 case 36:
1800 return AMDGPU::SI_SPILL_S288_RESTORE;
1801 case 40:
1802 return AMDGPU::SI_SPILL_S320_RESTORE;
1803 case 44:
1804 return AMDGPU::SI_SPILL_S352_RESTORE;
1805 case 48:
1806 return AMDGPU::SI_SPILL_S384_RESTORE;
1807 case 64:
1808 return AMDGPU::SI_SPILL_S512_RESTORE;
1809 case 128:
1810 return AMDGPU::SI_SPILL_S1024_RESTORE;
1811 default:
1812 llvm_unreachable("unknown register size");
1813 }
1814}
1815
1816static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1817 switch (Size) {
1818 case 4:
1819 return AMDGPU::SI_SPILL_V32_RESTORE;
1820 case 8:
1821 return AMDGPU::SI_SPILL_V64_RESTORE;
1822 case 12:
1823 return AMDGPU::SI_SPILL_V96_RESTORE;
1824 case 16:
1825 return AMDGPU::SI_SPILL_V128_RESTORE;
1826 case 20:
1827 return AMDGPU::SI_SPILL_V160_RESTORE;
1828 case 24:
1829 return AMDGPU::SI_SPILL_V192_RESTORE;
1830 case 28:
1831 return AMDGPU::SI_SPILL_V224_RESTORE;
1832 case 32:
1833 return AMDGPU::SI_SPILL_V256_RESTORE;
1834 case 36:
1835 return AMDGPU::SI_SPILL_V288_RESTORE;
1836 case 40:
1837 return AMDGPU::SI_SPILL_V320_RESTORE;
1838 case 44:
1839 return AMDGPU::SI_SPILL_V352_RESTORE;
1840 case 48:
1841 return AMDGPU::SI_SPILL_V384_RESTORE;
1842 case 64:
1843 return AMDGPU::SI_SPILL_V512_RESTORE;
1844 case 128:
1845 return AMDGPU::SI_SPILL_V1024_RESTORE;
1846 default:
1847 llvm_unreachable("unknown register size");
1848 }
1849}
1850
1851static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1852 switch (Size) {
1853 case 4:
1854 return AMDGPU::SI_SPILL_A32_RESTORE;
1855 case 8:
1856 return AMDGPU::SI_SPILL_A64_RESTORE;
1857 case 12:
1858 return AMDGPU::SI_SPILL_A96_RESTORE;
1859 case 16:
1860 return AMDGPU::SI_SPILL_A128_RESTORE;
1861 case 20:
1862 return AMDGPU::SI_SPILL_A160_RESTORE;
1863 case 24:
1864 return AMDGPU::SI_SPILL_A192_RESTORE;
1865 case 28:
1866 return AMDGPU::SI_SPILL_A224_RESTORE;
1867 case 32:
1868 return AMDGPU::SI_SPILL_A256_RESTORE;
1869 case 36:
1870 return AMDGPU::SI_SPILL_A288_RESTORE;
1871 case 40:
1872 return AMDGPU::SI_SPILL_A320_RESTORE;
1873 case 44:
1874 return AMDGPU::SI_SPILL_A352_RESTORE;
1875 case 48:
1876 return AMDGPU::SI_SPILL_A384_RESTORE;
1877 case 64:
1878 return AMDGPU::SI_SPILL_A512_RESTORE;
1879 case 128:
1880 return AMDGPU::SI_SPILL_A1024_RESTORE;
1881 default:
1882 llvm_unreachable("unknown register size");
1883 }
1884}
1885
1886static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1887 switch (Size) {
1888 case 4:
1889 return AMDGPU::SI_SPILL_AV32_RESTORE;
1890 case 8:
1891 return AMDGPU::SI_SPILL_AV64_RESTORE;
1892 case 12:
1893 return AMDGPU::SI_SPILL_AV96_RESTORE;
1894 case 16:
1895 return AMDGPU::SI_SPILL_AV128_RESTORE;
1896 case 20:
1897 return AMDGPU::SI_SPILL_AV160_RESTORE;
1898 case 24:
1899 return AMDGPU::SI_SPILL_AV192_RESTORE;
1900 case 28:
1901 return AMDGPU::SI_SPILL_AV224_RESTORE;
1902 case 32:
1903 return AMDGPU::SI_SPILL_AV256_RESTORE;
1904 case 36:
1905 return AMDGPU::SI_SPILL_AV288_RESTORE;
1906 case 40:
1907 return AMDGPU::SI_SPILL_AV320_RESTORE;
1908 case 44:
1909 return AMDGPU::SI_SPILL_AV352_RESTORE;
1910 case 48:
1911 return AMDGPU::SI_SPILL_AV384_RESTORE;
1912 case 64:
1913 return AMDGPU::SI_SPILL_AV512_RESTORE;
1914 case 128:
1915 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1916 default:
1917 llvm_unreachable("unknown register size");
1918 }
1919}
1920
1921static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1922 bool IsVectorSuperClass) {
1923 // Currently, only 32-bit WWM register spills are needed.
1924 if (Size != 4)
1925 llvm_unreachable("unknown wwm register spill size");
1926
1927 if (IsVectorSuperClass)
1928 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1929
1930 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1931}
1932
1933static unsigned
1934getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1935 unsigned Size, const SIRegisterInfo &TRI,
1936 const SIMachineFunctionInfo &MFI) {
1937 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1938
1939 // Choose the right opcode if restoring a WWM register.
1940 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1941 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1942
1943 if (IsVectorSuperClass)
1944 return getAVSpillRestoreOpcode(Size);
1945
1946 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1947 : getVGPRSpillRestoreOpcode(Size);
1948}
1949
1950void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1951 MachineBasicBlock::iterator MI,
1952 Register DestReg, int FrameIndex,
1953 const TargetRegisterClass *RC,
1954 const TargetRegisterInfo *TRI,
1955 Register VReg,
1956 MachineInstr::MIFlag Flags) const {
1957 MachineFunction *MF = MBB.getParent();
1958 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1959 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1960 const DebugLoc &DL = MBB.findDebugLoc(MI);
1961 unsigned SpillSize = TRI->getSpillSize(*RC);
1962
1963 MachinePointerInfo PtrInfo
1964 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1965
1966 MachineMemOperand *MMO = MF->getMachineMemOperand(
1967 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1968 FrameInfo.getObjectAlign(FrameIndex));
1969
1970 if (RI.isSGPRClass(RC)) {
1971 MFI->setHasSpilledSGPRs();
1972 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1973 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1974 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1975
1976 // FIXME: Maybe this should not include a memoperand because it will be
1977 // lowered to non-memory instructions.
1978 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1979 if (DestReg.isVirtual() && SpillSize == 4) {
1980 MachineRegisterInfo &MRI = MF->getRegInfo();
1981 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1982 }
1983
1984 if (RI.spillSGPRToVGPR())
1985 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1986 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1987 .addFrameIndex(FrameIndex) // addr
1988 .addMemOperand(MMO)
1989 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1990
1991 return;
1992 }
1993
1994 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1995 SpillSize, RI, *MFI);
1996 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1997 .addFrameIndex(FrameIndex) // vaddr
1998 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1999 .addImm(0) // offset
2000 .addMemOperand(MMO);
2001}
2002
2003void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
2004 MachineBasicBlock::iterator MI) const {
2005 insertNoops(MBB, MI, 1);
2006}
2007
2008void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
2009 MachineBasicBlock::iterator MI,
2010 unsigned Quantity) const {
2011 DebugLoc DL = MBB.findDebugLoc(MI);
2012 while (Quantity > 0) {
2013 unsigned Arg = std::min(Quantity, 8u);
2014 Quantity -= Arg;
2015 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
2016 }
2017}
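// Note: S_NOP's immediate encodes (wait states - 1) and saturates at 8 wait
// states per instruction, so e.g. insertNoops(MBB, MI, 10) emits
// "s_nop 7" followed by "s_nop 1".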
2018
2019void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
2020 auto *MF = MBB.getParent();
2021 auto *Info = MF->getInfo<SIMachineFunctionInfo>();
2022
2023 assert(Info->isEntryFunction());
2024
2025 if (MBB.succ_empty()) {
2026 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2027 if (HasNoTerminator) {
2028 if (Info->returnsVoid()) {
2029 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2030 } else {
2031 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2032 }
2033 }
2034 }
2035}
2036
2037MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
2038 MachineBasicBlock &MBB,
2039 MachineInstr &MI,
2040 const DebugLoc &DL) const {
2041 MachineFunction *MF = MBB.getParent();
2042 constexpr unsigned DoorbellIDMask = 0x3ff;
2043 constexpr unsigned ECQueueWaveAbort = 0x400;
2044
2045 MachineBasicBlock *TrapBB = &MBB;
2046 MachineBasicBlock *ContBB = &MBB;
2047 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2048
2049 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2050 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2051 TrapBB = MF->CreateMachineBasicBlock();
2052 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2053 MF->push_back(TrapBB);
2054 MBB.addSuccessor(TrapBB);
2055 }
2056
2057 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
2058 // this will be a nop.
2059 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2060 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2061 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2062 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2063 DoorbellReg)
2064 .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2065 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2066 .addUse(AMDGPU::M0);
2067 Register DoorbellRegMasked =
2068 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2069 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2070 .addUse(DoorbellReg)
2071 .addImm(DoorbellIDMask);
2072 Register SetWaveAbortBit =
2073 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2074 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2075 .addUse(DoorbellRegMasked)
2076 .addImm(ECQueueWaveAbort);
2077 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2078 .addUse(SetWaveAbortBit);
2079 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2080 .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
2081 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2082 .addUse(AMDGPU::TTMP2);
2083 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2084 TrapBB->addSuccessor(HaltLoopBB);
2085
2086 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2087 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2088 .addMBB(HaltLoopBB);
2089 MF->push_back(HaltLoopBB);
2090 HaltLoopBB->addSuccessor(HaltLoopBB);
2091
2092 return ContBB;
2093}
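// Roughly, the expansion above produces:
//   MBB:        s_cbranch_execnz TrapBB   (only if MBB had successors or
//                                          trailing instructions)
//   TrapBB:     s_trap 2; read the doorbell id, set the queue-wave-abort bit,
//               notify via s_sendmsg, restore m0, branch to HaltLoopBB
//   HaltLoopBB: s_sethalt 5; s_branch HaltLoopBB   (self-loop)
// with ContBB holding whatever followed the trap in the original block.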
2094
2095unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2096 switch (MI.getOpcode()) {
2097 default:
2098 if (MI.isMetaInstruction())
2099 return 0;
2100 return 1; // FIXME: Do wait states equal cycles?
2101
2102 case AMDGPU::S_NOP:
2103 return MI.getOperand(0).getImm() + 1;
2104 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2105 // hazard, even if one exists, won't really be visible. Should we handle it?
2106 }
2107}
2108
2109bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2110 MachineBasicBlock &MBB = *MI.getParent();
2111 DebugLoc DL = MBB.findDebugLoc(MI);
2112 switch (MI.getOpcode()) {
2113 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2114 case AMDGPU::S_MOV_B64_term:
2115 // This is only a terminator to get the correct spill code placement during
2116 // register allocation.
2117 MI.setDesc(get(AMDGPU::S_MOV_B64));
2118 break;
2119
2120 case AMDGPU::S_MOV_B32_term:
2121 // This is only a terminator to get the correct spill code placement during
2122 // register allocation.
2123 MI.setDesc(get(AMDGPU::S_MOV_B32));
2124 break;
2125
2126 case AMDGPU::S_XOR_B64_term:
2127 // This is only a terminator to get the correct spill code placement during
2128 // register allocation.
2129 MI.setDesc(get(AMDGPU::S_XOR_B64));
2130 break;
2131
2132 case AMDGPU::S_XOR_B32_term:
2133 // This is only a terminator to get the correct spill code placement during
2134 // register allocation.
2135 MI.setDesc(get(AMDGPU::S_XOR_B32));
2136 break;
2137 case AMDGPU::S_OR_B64_term:
2138 // This is only a terminator to get the correct spill code placement during
2139 // register allocation.
2140 MI.setDesc(get(AMDGPU::S_OR_B64));
2141 break;
2142 case AMDGPU::S_OR_B32_term:
2143 // This is only a terminator to get the correct spill code placement during
2144 // register allocation.
2145 MI.setDesc(get(AMDGPU::S_OR_B32));
2146 break;
2147
2148 case AMDGPU::S_ANDN2_B64_term:
2149 // This is only a terminator to get the correct spill code placement during
2150 // register allocation.
2151 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2152 break;
2153
2154 case AMDGPU::S_ANDN2_B32_term:
2155 // This is only a terminator to get the correct spill code placement during
2156 // register allocation.
2157 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2158 break;
2159
2160 case AMDGPU::S_AND_B64_term:
2161 // This is only a terminator to get the correct spill code placement during
2162 // register allocation.
2163 MI.setDesc(get(AMDGPU::S_AND_B64));
2164 break;
2165
2166 case AMDGPU::S_AND_B32_term:
2167 // This is only a terminator to get the correct spill code placement during
2168 // register allocation.
2169 MI.setDesc(get(AMDGPU::S_AND_B32));
2170 break;
2171
2172 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2173 // This is only a terminator to get the correct spill code placement during
2174 // register allocation.
2175 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2176 break;
2177
2178 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2179 // This is only a terminator to get the correct spill code placement during
2180 // register allocation.
2181 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2182 break;
2183
2184 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2185 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2186 break;
2187
2188 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2189 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2190 break;
2191
2192 case AMDGPU::V_MOV_B64_PSEUDO: {
2193 Register Dst = MI.getOperand(0).getReg();
2194 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2195 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2196
2197 const MachineOperand &SrcOp = MI.getOperand(1);
2198 // FIXME: Will this work for 64-bit floating point immediates?
2199 assert(!SrcOp.isFPImm());
2200 if (ST.hasMovB64()) {
2201 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2202 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2203 isUInt<32>(SrcOp.getImm()))
2204 break;
2205 }
2206 if (SrcOp.isImm()) {
2207 APInt Imm(64, SrcOp.getImm());
2208 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2209 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2210 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2211 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2213 .addImm(Lo.getSExtValue())
2215 .addImm(Lo.getSExtValue())
2216 .addImm(0) // op_sel_lo
2217 .addImm(0) // op_sel_hi
2218 .addImm(0) // neg_lo
2219 .addImm(0) // neg_hi
2220 .addImm(0); // clamp
2221 } else {
2222 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2223 .addImm(Lo.getSExtValue())
2225 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2226 .addImm(Hi.getSExtValue())
2228 }
2229 } else {
2230 assert(SrcOp.isReg());
2231 if (ST.hasPkMovB32() &&
2232 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2233 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2234 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2235 .addReg(SrcOp.getReg())
2237 .addReg(SrcOp.getReg())
2238 .addImm(0) // op_sel_lo
2239 .addImm(0) // op_sel_hi
2240 .addImm(0) // neg_lo
2241 .addImm(0) // neg_hi
2242 .addImm(0); // clamp
2243 } else {
2244 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2245 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2247 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2248 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2250 }
2251 }
2252 MI.eraseFromParent();
2253 break;
2254 }
2255 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2256 expandMovDPP64(MI);
2257 break;
2258 }
2259 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2260 const MachineOperand &SrcOp = MI.getOperand(1);
2261 assert(!SrcOp.isFPImm());
2262 APInt Imm(64, SrcOp.getImm());
2263 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2264 MI.setDesc(get(AMDGPU::S_MOV_B64));
2265 break;
2266 }
2267
2268 Register Dst = MI.getOperand(0).getReg();
2269 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2270 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2271
2272 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2273 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2274 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2275 .addImm(Lo.getSExtValue())
2277 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2278 .addImm(Hi.getSExtValue())
2280 MI.eraseFromParent();
2281 break;
2282 }
2283 case AMDGPU::V_SET_INACTIVE_B32: {
2284 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2285 Register DstReg = MI.getOperand(0).getReg();
2286 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2287 .add(MI.getOperand(3))
2288 .add(MI.getOperand(4))
2289 .add(MI.getOperand(1))
2290 .add(MI.getOperand(2))
2291 .add(MI.getOperand(5));
2292 MI.eraseFromParent();
2293 break;
2294 }
2295 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2296 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2297 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2298 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2299 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2300 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2301 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2302 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2303 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2304 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2305 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2306 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2307 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2308 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2309 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2310 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2311 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2312 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2313 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2314 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2315 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2316 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2317 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2318 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2319 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2320 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2322 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2323 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2324 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2325
2326 unsigned Opc;
2327 if (RI.hasVGPRs(EltRC)) {
2328 Opc = AMDGPU::V_MOVRELD_B32_e32;
2329 } else {
2330 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2331 : AMDGPU::S_MOVRELD_B32;
2332 }
2333
2334 const MCInstrDesc &OpDesc = get(Opc);
2335 Register VecReg = MI.getOperand(0).getReg();
2336 bool IsUndef = MI.getOperand(1).isUndef();
2337 unsigned SubReg = MI.getOperand(3).getImm();
2338 assert(VecReg == MI.getOperand(1).getReg());
2339
2341 BuildMI(MBB, MI, DL, OpDesc)
2342 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2343 .add(MI.getOperand(2))
2345 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2346
2347 const int ImpDefIdx =
2348 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2349 const int ImpUseIdx = ImpDefIdx + 1;
2350 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2351 MI.eraseFromParent();
2352 break;
2353 }
2354 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2355 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2356 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2357 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2358 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2359 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2360 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2361 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2362 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2363 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2364 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2365 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2367 Register VecReg = MI.getOperand(0).getReg();
2368 bool IsUndef = MI.getOperand(1).isUndef();
2369 Register Idx = MI.getOperand(3).getReg();
2370 Register SubReg = MI.getOperand(4).getImm();
2371
2372 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2373 .addReg(Idx)
2375 SetOn->getOperand(3).setIsUndef();
2376
2377 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2379 BuildMI(MBB, MI, DL, OpDesc)
2380 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2381 .add(MI.getOperand(2))
2383 .addReg(VecReg,
2384 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2385
2386 const int ImpDefIdx =
2387 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2388 const int ImpUseIdx = ImpDefIdx + 1;
2389 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2390
2391 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2392
2393 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2394
2395 MI.eraseFromParent();
2396 break;
2397 }
2398 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2399 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2400 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2401 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2402 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2403 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2404 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2405 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2406 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2407 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2408 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2409 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2411 Register Dst = MI.getOperand(0).getReg();
2412 Register VecReg = MI.getOperand(1).getReg();
2413 bool IsUndef = MI.getOperand(1).isUndef();
2414 Register Idx = MI.getOperand(2).getReg();
2415 Register SubReg = MI.getOperand(3).getImm();
2416
2417 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2418 .addReg(Idx)
2420 SetOn->getOperand(3).setIsUndef();
2421
2422 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2423 .addDef(Dst)
2424 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2425 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2426
2427 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2428
2429 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2430
2431 MI.eraseFromParent();
2432 break;
2433 }
2434 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2435 MachineFunction &MF = *MBB.getParent();
2436 Register Reg = MI.getOperand(0).getReg();
2437 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2438 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2439 MachineOperand OpLo = MI.getOperand(1);
2440 MachineOperand OpHi = MI.getOperand(2);
2441
2442 // Create a bundle so these instructions won't be re-ordered by the
2443 // post-RA scheduler.
2444 MIBundleBuilder Bundler(MBB, MI);
2445 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2446
2447 // What we want here is an offset from the value returned by s_getpc (which
2448 // is the address of the s_add_u32 instruction) to the global variable, but
2449 // since the encoding of $symbol starts 4 bytes after the start of the
2450 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2451 // small. This requires us to add 4 to the global variable offset in order
2452 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2453 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2454 // instruction.
2455
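// Illustrative final code (assuming no getpc zero-extension fixup is needed);
// s[N:N+1] stands for the destination SGPR pair:
//   s_getpc_b64 s[N:N+1]
//   s_add_u32   sN,   sN,   sym@rel32@lo+4
//   s_addc_u32  sN+1, sN+1, sym@rel32@hi+12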
2456 int64_t Adjust = 0;
2457 if (ST.hasGetPCZeroExtension()) {
2458 // Fix up hardware that does not sign-extend the 48-bit PC value by
2459 // inserting: s_sext_i32_i16 reghi, reghi
2460 Bundler.append(
2461 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2462 Adjust += 4;
2463 }
2464
2465 if (OpLo.isGlobal())
2466 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2467 Bundler.append(
2468 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2469
2470 if (OpHi.isGlobal())
2471 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2472 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2473 .addReg(RegHi)
2474 .add(OpHi));
2475
2476 finalizeBundle(MBB, Bundler.begin());
2477
2478 MI.eraseFromParent();
2479 break;
2480 }
2481 case AMDGPU::ENTER_STRICT_WWM: {
2482 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2483 // Whole Wave Mode is entered.
2484 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2485 : AMDGPU::S_OR_SAVEEXEC_B64));
2486 break;
2487 }
2488 case AMDGPU::ENTER_STRICT_WQM: {
2489 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2490 // STRICT_WQM is entered.
2491 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2492 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2493 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2494 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2495 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2496
2497 MI.eraseFromParent();
2498 break;
2499 }
2500 case AMDGPU::EXIT_STRICT_WWM:
2501 case AMDGPU::EXIT_STRICT_WQM: {
2502 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2503 // WWM/STRICT_WQM is exited.
2504 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2505 break;
2506 }
2507 case AMDGPU::SI_RETURN: {
2508 const MachineFunction *MF = MBB.getParent();
2509 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2510 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2511 // Hiding the return address use with SI_RETURN may lead to extra kills in
2512 // the function and missing live-ins. We are fine in practice because callee
2513 // saved register handling ensures the register value is restored before
2514 // RET, but we need the undef flag here to appease the MachineVerifier
2515 // liveness checks.
2517 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2518 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2519
2520 MIB.copyImplicitOps(MI);
2521 MI.eraseFromParent();
2522 break;
2523 }
2524
2525 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2526 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2527 MI.setDesc(get(AMDGPU::S_MUL_U64));
2528 break;
2529
2530 case AMDGPU::S_GETPC_B64_pseudo:
2531 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2532 if (ST.hasGetPCZeroExtension()) {
2533 Register Dst = MI.getOperand(0).getReg();
2534 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2535 // Fix up hardware that does not sign-extend the 48-bit PC value by
2536 // inserting: s_sext_i32_i16 dsthi, dsthi
2537 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2538 DstHi)
2539 .addReg(DstHi);
2540 }
2541 break;
2542 }
2543 return true;
2544}
2545
2548 unsigned SubIdx, const MachineInstr &Orig,
2549 const TargetRegisterInfo &RI) const {
2550
2551 // Try shrinking the instruction to remat only the part needed for the
2552 // current context.
2553 // TODO: Handle more cases.
2554 unsigned Opcode = Orig.getOpcode();
2555 switch (Opcode) {
2556 case AMDGPU::S_LOAD_DWORDX16_IMM:
2557 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2558 if (SubIdx != 0)
2559 break;
2560
2561 if (I == MBB.end())
2562 break;
2563
2564 if (I->isBundled())
2565 break;
2566
2567 // Look for a single use of the defined register that also uses a subregister index.
2568 Register RegToFind = Orig.getOperand(0).getReg();
2569 MachineOperand *UseMO = nullptr;
2570 for (auto &CandMO : I->operands()) {
2571 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2572 continue;
2573 if (UseMO) {
2574 UseMO = nullptr;
2575 break;
2576 }
2577 UseMO = &CandMO;
2578 }
2579 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2580 break;
2581
2582 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2583 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2584
2587 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2588
2589 unsigned NewOpcode = -1;
2590 if (SubregSize == 256)
2591 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2592 else if (SubregSize == 128)
2593 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2594 else
2595 break;
2596
2597 const MCInstrDesc &TID = get(NewOpcode);
2598 const TargetRegisterClass *NewRC =
2599 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2600 MRI.setRegClass(DestReg, NewRC);
2601
2602 UseMO->setReg(DestReg);
2603 UseMO->setSubReg(AMDGPU::NoSubRegister);
2604
2605 // Use a smaller load with the desired size, possibly with updated offset.
2606 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2607 MI->setDesc(TID);
2608 MI->getOperand(0).setReg(DestReg);
2609 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2610 if (Offset) {
2611 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2612 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2613 OffsetMO->setImm(FinalOffset);
2614 }
2616 for (const MachineMemOperand *MemOp : Orig.memoperands())
2617 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2618 SubregSize / 8));
2619 MI->setMemRefs(*MF, NewMMOs);
2620
2621 MBB.insert(I, MI);
2622 return;
2623 }
2624
2625 default:
2626 break;
2627 }
2628
2629 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2630}
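// Example of the narrowing above: if only sub4_sub5_sub6_sub7 of an
// S_LOAD_DWORDX8_IMM result is used, the rematerialized clone becomes an
// S_LOAD_DWORDX4_IMM whose immediate offset is bumped by Offset/8 = 16 bytes
// and whose memoperand is shrunk to SubregSize/8 = 16 bytes.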
2631
2632std::pair<MachineInstr*, MachineInstr*>
2634 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2635
2636 if (ST.hasMovB64() &&
2638 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2639 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2640 return std::pair(&MI, nullptr);
2641 }
2642
2643 MachineBasicBlock &MBB = *MI.getParent();
2647 Register Dst = MI.getOperand(0).getReg();
2648 unsigned Part = 0;
2649 MachineInstr *Split[2];
2650
2651 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2652 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2653 if (Dst.isPhysical()) {
2654 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2655 } else {
2656 assert(MRI.isSSA());
2657 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2658 MovDPP.addDef(Tmp);
2659 }
2660
2661 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2662 const MachineOperand &SrcOp = MI.getOperand(I);
2663 assert(!SrcOp.isFPImm());
2664 if (SrcOp.isImm()) {
2665 APInt Imm(64, SrcOp.getImm());
2666 Imm.ashrInPlace(Part * 32);
2667 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2668 } else {
2669 assert(SrcOp.isReg());
2670 Register Src = SrcOp.getReg();
2671 if (Src.isPhysical())
2672 MovDPP.addReg(RI.getSubReg(Src, Sub));
2673 else
2674 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2675 }
2676 }
2677
2678 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2679 MovDPP.addImm(MO.getImm());
2680
2681 Split[Part] = MovDPP;
2682 ++Part;
2683 }
2684
2685 if (Dst.isVirtual())
2686 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2687 .addReg(Split[0]->getOperand(0).getReg())
2688 .addImm(AMDGPU::sub0)
2689 .addReg(Split[1]->getOperand(0).getReg())
2690 .addImm(AMDGPU::sub1);
2691
2692 MI.eraseFromParent();
2693 return std::pair(Split[0], Split[1]);
2694}
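// In other words, when 64-bit DPP is not available a V_MOV_B64_DPP_PSEUDO is
// split into two v_mov_b32_dpp instructions (one per 32-bit half); for a
// virtual destination the halves are stitched back together with a
// REG_SEQUENCE over sub0/sub1.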
2695
2696std::optional<DestSourcePair>
2698 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2699 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2700
2701 return std::nullopt;
2702}
2703
2705 MachineOperand &Src0,
2706 unsigned Src0OpName,
2707 MachineOperand &Src1,
2708 unsigned Src1OpName) const {
2709 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2710 if (!Src0Mods)
2711 return false;
2712
2713 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2714 assert(Src1Mods &&
2715 "All commutable instructions have both src0 and src1 modifiers");
2716
2717 int Src0ModsVal = Src0Mods->getImm();
2718 int Src1ModsVal = Src1Mods->getImm();
2719
2720 Src1Mods->setImm(Src0ModsVal);
2721 Src0Mods->setImm(Src1ModsVal);
2722 return true;
2723}
2724
2726 MachineOperand &RegOp,
2727 MachineOperand &NonRegOp) {
2728 Register Reg = RegOp.getReg();
2729 unsigned SubReg = RegOp.getSubReg();
2730 bool IsKill = RegOp.isKill();
2731 bool IsDead = RegOp.isDead();
2732 bool IsUndef = RegOp.isUndef();
2733 bool IsDebug = RegOp.isDebug();
2734
2735 if (NonRegOp.isImm())
2736 RegOp.ChangeToImmediate(NonRegOp.getImm());
2737 else if (NonRegOp.isFI())
2738 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2739 else if (NonRegOp.isGlobal()) {
2740 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2741 NonRegOp.getTargetFlags());
2742 } else
2743 return nullptr;
2744
2745 // Make sure we don't reinterpret a subreg index in the target flags.
2746 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2747
2748 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2749 NonRegOp.setSubReg(SubReg);
2750
2751 return &MI;
2752}
2753
2755 MachineOperand &NonRegOp1,
2756 MachineOperand &NonRegOp2) {
2757 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2758 int64_t NonRegVal = NonRegOp1.getImm();
2759
2760 NonRegOp1.setImm(NonRegOp2.getImm());
2761 NonRegOp2.setImm(NonRegVal);
2762 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2763 NonRegOp2.setTargetFlags(TargetFlags);
2764 return &MI;
2765}
2766
2767bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2768 const MachineOperand *MO0, unsigned OpIdx1,
2769 const MachineOperand *MO1) const {
2770 const MCInstrDesc &InstDesc = MI.getDesc();
2771 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2772 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2773 const TargetRegisterClass *DefinedRC1 =
2774 OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo1.RegClass) : nullptr;
2775 const TargetRegisterClass *DefinedRC0 =
2776 OpInfo0.RegClass != -1 ? RI.getRegClass(OpInfo0.RegClass) : nullptr;
2777
2778 unsigned Opc = MI.getOpcode();
2779 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2780
2781 // A swap must not breach the constant bus or literal limits. In particular,
2782 // it may move a literal to a position other than src0, which is not allowed
2783 // pre-gfx10; however, most test cases need literals in Src0 for VOP.
2784 // FIXME: After gfx9, literal can be in place other than Src0
2785 if (isVALU(MI)) {
2786 if ((int)OpIdx0 == Src0Idx && !MO0->isReg() &&
2787 !isInlineConstant(*MO0, OpInfo1))
2788 return false;
2789 if ((int)OpIdx1 == Src0Idx && !MO1->isReg() &&
2790 !isInlineConstant(*MO1, OpInfo0))
2791 return false;
2792 }
2793
2794 if ((int)OpIdx1 != Src0Idx && MO0->isReg()) {
2795 if (!DefinedRC1)
2796 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2797 return isLegalRegOperand(MI, OpIdx1, *MO0);
2798 }
2799 if ((int)OpIdx0 != Src0Idx && MO1->isReg()) {
2800 if (!DefinedRC0)
2801 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2802 return isLegalRegOperand(MI, OpIdx0, *MO1);
2803 }
2804
2805 // No need to check 64-bit literals, since swapping does not bring new
2806 // 64-bit literals into the current instruction to fold to 32-bit.
2807
2808 return isImmOperandLegal(MI, OpIdx1, *MO0);
2809}
2810
2812 unsigned Src0Idx,
2813 unsigned Src1Idx) const {
2814 assert(!NewMI && "this should never be used");
2815
2816 unsigned Opc = MI.getOpcode();
2817 int CommutedOpcode = commuteOpcode(Opc);
2818 if (CommutedOpcode == -1)
2819 return nullptr;
2820
2821 if (Src0Idx > Src1Idx)
2822 std::swap(Src0Idx, Src1Idx);
2823
2824 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2825 static_cast<int>(Src0Idx) &&
2826 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2827 static_cast<int>(Src1Idx) &&
2828 "inconsistency with findCommutedOpIndices");
2829
2830 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2831 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2832 if (!isLegalToSwap(MI, Src0Idx, &Src0, Src1Idx, &Src1)) {
2833 return nullptr;
2834 }
2835 MachineInstr *CommutedMI = nullptr;
2836 if (Src0.isReg() && Src1.isReg()) {
2837 // Be sure to copy the source modifiers to the right place.
2838 CommutedMI =
2839 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2840 } else if (Src0.isReg() && !Src1.isReg()) {
2841 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2842 } else if (!Src0.isReg() && Src1.isReg()) {
2843 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2844 } else if (Src0.isImm() && Src1.isImm()) {
2845 CommutedMI = swapImmOperands(MI, Src0, Src1);
2846 } else {
2847 // FIXME: Found two non registers to commute. This does happen.
2848 return nullptr;
2849 }
2850
2851 if (CommutedMI) {
2852 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2853 Src1, AMDGPU::OpName::src1_modifiers);
2854
2855 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2856 AMDGPU::OpName::src1_sel);
2857
2858 CommutedMI->setDesc(get(CommutedOpcode));
2859 }
2860
2861 return CommutedMI;
2862}
2863
2864// This needs to be implemented because the source modifiers may be inserted
2865// between the true commutable operands, and the base
2866// TargetInstrInfo::commuteInstruction uses it.
2868 unsigned &SrcOpIdx0,
2869 unsigned &SrcOpIdx1) const {
2870 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2871}
2872
2874 unsigned &SrcOpIdx0,
2875 unsigned &SrcOpIdx1) const {
2876 if (!Desc.isCommutable())
2877 return false;
2878
2879 unsigned Opc = Desc.getOpcode();
2880 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2881 if (Src0Idx == -1)
2882 return false;
2883
2884 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2885 if (Src1Idx == -1)
2886 return false;
2887
2888 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2889}
2890
2892 int64_t BrOffset) const {
2893 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2894 // block is unanalyzable.
2895 assert(BranchOp != AMDGPU::S_SETPC_B64);
2896
2897 // Convert to dwords.
2898 BrOffset /= 4;
2899
2900 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2901 // from the next instruction.
2902 BrOffset -= 1;
2903
2904 return isIntN(BranchOffsetBits, BrOffset);
2905}
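// With 16 branch-offset bits this allows a signed 16-bit dword offset, i.e.
// roughly +/-128 KiB measured from the instruction that follows the branch.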
2906
2909 return MI.getOperand(0).getMBB();
2910}
2911
2913 for (const MachineInstr &MI : MBB->terminators()) {
2914 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2915 MI.getOpcode() == AMDGPU::SI_LOOP)
2916 return true;
2917 }
2918 return false;
2919}
2920
2922 MachineBasicBlock &DestBB,
2923 MachineBasicBlock &RestoreBB,
2924 const DebugLoc &DL, int64_t BrOffset,
2925 RegScavenger *RS) const {
2926 assert(RS && "RegScavenger required for long branching");
2927 assert(MBB.empty() &&
2928 "new block should be inserted for expanding unconditional branch");
2929 assert(MBB.pred_size() == 1);
2930 assert(RestoreBB.empty() &&
2931 "restore block should be inserted for restoring clobbered registers");
2932
2936
2937 // FIXME: Virtual register workaround for RegScavenger not working with empty
2938 // blocks.
2939 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2940
2941 auto I = MBB.end();
2942
2943 // Note: as this is used after the hazard recognizer, we need to apply some
2944 // hazard workarounds directly.
2945 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2947 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2948 if (FlushSGPRWrites)
2949 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2951 };
2952
2953 // We need to compute the offset relative to the instruction immediately after
2954 // s_getpc_b64. Insert the pc arithmetic code before the last terminator.
2955 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2956 ApplyHazardWorkarounds();
2957
2958 auto &MCCtx = MF->getContext();
2959 MCSymbol *PostGetPCLabel =
2960 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2961 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2962
2963 MCSymbol *OffsetLo =
2964 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2965 MCSymbol *OffsetHi =
2966 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2967 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2968 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2969 .addReg(PCReg, 0, AMDGPU::sub0)
2970 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2971 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2972 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2973 .addReg(PCReg, 0, AMDGPU::sub1)
2974 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2975 ApplyHazardWorkarounds();
2976
2977 // Insert the indirect branch after the other terminator.
2978 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2979 .addReg(PCReg);
2980
2981 // If a spill is needed for the pc register pair, we need to insert a spill
2982 // restore block right before the destination block, and insert a short branch
2983 // into the old destination block's fallthrough predecessor.
2984 // e.g.:
2985 //
2986 // s_cbranch_scc0 skip_long_branch:
2987 //
2988 // long_branch_bb:
2989 // spill s[8:9]
2990 // s_getpc_b64 s[8:9]
2991 // s_add_u32 s8, s8, restore_bb
2992 // s_addc_u32 s9, s9, 0
2993 // s_setpc_b64 s[8:9]
2994 //
2995 // skip_long_branch:
2996 // foo;
2997 //
2998 // .....
2999 //
3000 // dest_bb_fallthrough_predecessor:
3001 // bar;
3002 // s_branch dest_bb
3003 //
3004 // restore_bb:
3005 // restore s[8:9]
3006 // fallthrough dest_bb
3007 //
3008 // dest_bb:
3009 // buzz;
3010
3011 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3012 Register Scav;
3013
3014 // If we've previously reserved a register for long branches,
3015 // avoid running the scavenger and just use that register.
3016 if (LongBranchReservedReg) {
3017 RS->enterBasicBlock(MBB);
3018 Scav = LongBranchReservedReg;
3019 } else {
3021 Scav = RS->scavengeRegisterBackwards(
3022 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3023 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3024 }
3025 if (Scav) {
3026 RS->setRegUsed(Scav);
3027 MRI.replaceRegWith(PCReg, Scav);
3028 MRI.clearVirtRegs();
3029 } else {
3030 // As an SGPR needs a VGPR to be spilled, we reuse the slot of the temporary
3031 // VGPR for the SGPR spill.
3032 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3033 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3034 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3035 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3036 MRI.clearVirtRegs();
3037 }
3038
3039 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3040 // Now the distance from the post-getpc label to the destination can be defined.
3042 MCSymbolRefExpr::create(DestLabel, MCCtx),
3043 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3044 // Add offset assignments.
3045 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3046 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3047 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3048 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3049}
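// The symbol arithmetic above effectively defines
//   offset_lo = (dest - post_getpc) & 0xffffffff
//   offset_hi = (dest - post_getpc) >> 32   (arithmetic shift)
// which the s_add_u32/s_addc_u32 pair adds to the 64-bit value returned by
// s_getpc_b64.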
3050
3051unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3052 switch (Cond) {
3053 case SIInstrInfo::SCC_TRUE:
3054 return AMDGPU::S_CBRANCH_SCC1;
3055 case SIInstrInfo::SCC_FALSE:
3056 return AMDGPU::S_CBRANCH_SCC0;
3057 case SIInstrInfo::VCCNZ:
3058 return AMDGPU::S_CBRANCH_VCCNZ;
3059 case SIInstrInfo::VCCZ:
3060 return AMDGPU::S_CBRANCH_VCCZ;
3061 case SIInstrInfo::EXECNZ:
3062 return AMDGPU::S_CBRANCH_EXECNZ;
3063 case SIInstrInfo::EXECZ:
3064 return AMDGPU::S_CBRANCH_EXECZ;
3065 default:
3066 llvm_unreachable("invalid branch predicate");
3067 }
3068}
3069
3070SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3071 switch (Opcode) {
3072 case AMDGPU::S_CBRANCH_SCC0:
3073 return SCC_FALSE;
3074 case AMDGPU::S_CBRANCH_SCC1:
3075 return SCC_TRUE;
3076 case AMDGPU::S_CBRANCH_VCCNZ:
3077 return VCCNZ;
3078 case AMDGPU::S_CBRANCH_VCCZ:
3079 return VCCZ;
3080 case AMDGPU::S_CBRANCH_EXECNZ:
3081 return EXECNZ;
3082 case AMDGPU::S_CBRANCH_EXECZ:
3083 return EXECZ;
3084 default:
3085 return INVALID_BR;
3086 }
3087}
3088
3092 MachineBasicBlock *&FBB,
3094 bool AllowModify) const {
3095 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3096 // Unconditional Branch
3097 TBB = I->getOperand(0).getMBB();
3098 return false;
3099 }
3100
3101 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3102 if (Pred == INVALID_BR)
3103 return true;
3104
3105 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3106 Cond.push_back(MachineOperand::CreateImm(Pred));
3107 Cond.push_back(I->getOperand(1)); // Save the branch register.
3108
3109 ++I;
3110
3111 if (I == MBB.end()) {
3112 // Conditional branch followed by fall-through.
3113 TBB = CondBB;
3114 return false;
3115 }
3116
3117 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3118 TBB = CondBB;
3119 FBB = I->getOperand(0).getMBB();
3120 return false;
3121 }
3122
3123 return true;
3124}
3125
3127 MachineBasicBlock *&FBB,
3129 bool AllowModify) const {
3131 auto E = MBB.end();
3132 if (I == E)
3133 return false;
3134
3135 // Skip over the instructions that are artificial terminators for special
3136 // exec management.
3137 while (I != E && !I->isBranch() && !I->isReturn()) {
3138 switch (I->getOpcode()) {
3139 case AMDGPU::S_MOV_B64_term:
3140 case AMDGPU::S_XOR_B64_term:
3141 case AMDGPU::S_OR_B64_term:
3142 case AMDGPU::S_ANDN2_B64_term:
3143 case AMDGPU::S_AND_B64_term:
3144 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3145 case AMDGPU::S_MOV_B32_term:
3146 case AMDGPU::S_XOR_B32_term:
3147 case AMDGPU::S_OR_B32_term:
3148 case AMDGPU::S_ANDN2_B32_term:
3149 case AMDGPU::S_AND_B32_term:
3150 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3151 break;
3152 case AMDGPU::SI_IF:
3153 case AMDGPU::SI_ELSE:
3154 case AMDGPU::SI_KILL_I1_TERMINATOR:
3155 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3156 // FIXME: It's messy that these need to be considered here at all.
3157 return true;
3158 default:
3159 llvm_unreachable("unexpected non-branch terminator inst");
3160 }
3161
3162 ++I;
3163 }
3164
3165 if (I == E)
3166 return false;
3167
3168 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3169}
3170
3172 int *BytesRemoved) const {
3173 unsigned Count = 0;
3174 unsigned RemovedSize = 0;
3176 // Skip over artificial terminators when removing instructions.
3177 if (MI.isBranch() || MI.isReturn()) {
3178 RemovedSize += getInstSizeInBytes(MI);
3179 MI.eraseFromParent();
3180 ++Count;
3181 }
3182 }
3183
3184 if (BytesRemoved)
3185 *BytesRemoved = RemovedSize;
3186
3187 return Count;
3188}
3189
3190// Copy the flags onto the implicit condition register operand.
3192 const MachineOperand &OrigCond) {
3193 CondReg.setIsUndef(OrigCond.isUndef());
3194 CondReg.setIsKill(OrigCond.isKill());
3195}
3196
3199 MachineBasicBlock *FBB,
3201 const DebugLoc &DL,
3202 int *BytesAdded) const {
3203 if (!FBB && Cond.empty()) {
3204 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3205 .addMBB(TBB);
3206 if (BytesAdded)
3207 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3208 return 1;
3209 }
3210
3211 assert(TBB && Cond[0].isImm());
3212
3213 unsigned Opcode
3214 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3215
3216 if (!FBB) {
3217 MachineInstr *CondBr =
3218 BuildMI(&MBB, DL, get(Opcode))
3219 .addMBB(TBB);
3220
3221 // Copy the flags onto the implicit condition register operand.
3222 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3223 fixImplicitOperands(*CondBr);
3224
3225 if (BytesAdded)
3226 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3227 return 1;
3228 }
3229
3230 assert(TBB && FBB);
3231
3232 MachineInstr *CondBr =
3233 BuildMI(&MBB, DL, get(Opcode))
3234 .addMBB(TBB);
3235 fixImplicitOperands(*CondBr);
3236 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3237 .addMBB(FBB);
3238
3239 MachineOperand &CondReg = CondBr->getOperand(1);
3240 CondReg.setIsUndef(Cond[1].isUndef());
3241 CondReg.setIsKill(Cond[1].isKill());
3242
3243 if (BytesAdded)
3244 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3245
3246 return 2;
3247}
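// The byte counts reported above assume 4 bytes per s_branch/s_cbranch; on
// subtargets with the branch offset-0x3f hardware bug each branch is counted
// as 8 bytes so relaxation conservatively accounts for the extra workaround
// instruction that may be inserted.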
3248
3251 if (Cond.size() != 2) {
3252 return true;
3253 }
3254
3255 if (Cond[0].isImm()) {
3256 Cond[0].setImm(-Cond[0].getImm());
3257 return false;
3258 }
3259
3260 return true;
3261}
3262
3265 Register DstReg, Register TrueReg,
3266 Register FalseReg, int &CondCycles,
3267 int &TrueCycles, int &FalseCycles) const {
3268 switch (Cond[0].getImm()) {
3269 case VCCNZ:
3270 case VCCZ: {
3272 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3273 if (MRI.getRegClass(FalseReg) != RC)
3274 return false;
3275
3276 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3277 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3278
3279 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3280 return RI.hasVGPRs(RC) && NumInsts <= 6;
3281 }
3282 case SCC_TRUE:
3283 case SCC_FALSE: {
3284 // FIXME: We could insert for VGPRs if we could replace the original compare
3285 // with a vector one.
3287 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3288 if (MRI.getRegClass(FalseReg) != RC)
3289 return false;
3290
3291 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3292
3293 // Sizes that are a multiple of 64 bits can use s_cselect_b64.
3294 if (NumInsts % 2 == 0)
3295 NumInsts /= 2;
3296
3297 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3298 return RI.isSGPRClass(RC);
3299 }
3300 default:
3301 return false;
3302 }
3303}
3304
3308 Register TrueReg, Register FalseReg) const {
3309 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3310 if (Pred == VCCZ || Pred == SCC_FALSE) {
3311 Pred = static_cast<BranchPredicate>(-Pred);
3312 std::swap(TrueReg, FalseReg);
3313 }
3314
3316 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3317 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3318
3319 if (DstSize == 32) {
3321 if (Pred == SCC_TRUE) {
3322 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3323 .addReg(TrueReg)
3324 .addReg(FalseReg);
3325 } else {
3326 // The instruction's operands are backwards from what is expected.
3327 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3328 .addReg(FalseReg)
3329 .addReg(TrueReg);
3330 }
3331
3332 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3333 return;
3334 }
3335
3336 if (DstSize == 64 && Pred == SCC_TRUE) {
3338 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3339 .addReg(TrueReg)
3340 .addReg(FalseReg);
3341
3342 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3343 return;
3344 }
3345
3346 static const int16_t Sub0_15[] = {
3347 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3348 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3349 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3350 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3351 };
3352
3353 static const int16_t Sub0_15_64[] = {
3354 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3355 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3356 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3357 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3358 };
3359
3360 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3361 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3362 const int16_t *SubIndices = Sub0_15;
3363 int NElts = DstSize / 32;
3364
3365 // 64-bit select is only available for SALU.
3366 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3367 if (Pred == SCC_TRUE) {
3368 if (NElts % 2) {
3369 SelOp = AMDGPU::S_CSELECT_B32;
3370 EltRC = &AMDGPU::SGPR_32RegClass;
3371 } else {
3372 SelOp = AMDGPU::S_CSELECT_B64;
3373 EltRC = &AMDGPU::SGPR_64RegClass;
3374 SubIndices = Sub0_15_64;
3375 NElts /= 2;
3376 }
3377 }
3378
3380 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3381
3382 I = MIB->getIterator();
3383
3385 for (int Idx = 0; Idx != NElts; ++Idx) {
3386 Register DstElt = MRI.createVirtualRegister(EltRC);
3387 Regs.push_back(DstElt);
3388
3389 unsigned SubIdx = SubIndices[Idx];
3390
3392 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3393 Select =
3394 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3395 .addReg(FalseReg, 0, SubIdx)
3396 .addReg(TrueReg, 0, SubIdx);
3397 } else {
3398 Select =
3399 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3400 .addReg(TrueReg, 0, SubIdx)
3401 .addReg(FalseReg, 0, SubIdx);
3402 }
3403
3404 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3406
3407 MIB.addReg(DstElt)
3408 .addImm(SubIdx);
3409 }
3410}
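// For example, a 128-bit SGPR select under SCC is emitted by the loop above
// as two s_cselect_b64 (one per 64-bit half) feeding a REG_SEQUENCE, while a
// VGPR select falls back to one v_cndmask_b32 per 32-bit element.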
3411
3413 switch (MI.getOpcode()) {
3414 case AMDGPU::V_MOV_B16_t16_e32:
3415 case AMDGPU::V_MOV_B16_t16_e64:
3416 case AMDGPU::V_MOV_B32_e32:
3417 case AMDGPU::V_MOV_B32_e64:
3418 case AMDGPU::V_MOV_B64_PSEUDO:
3419 case AMDGPU::V_MOV_B64_e32:
3420 case AMDGPU::V_MOV_B64_e64:
3421 case AMDGPU::S_MOV_B32:
3422 case AMDGPU::S_MOV_B64:
3423 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3424 case AMDGPU::COPY:
3425 case AMDGPU::WWM_COPY:
3426 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3427 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3428 case AMDGPU::V_ACCVGPR_MOV_B32:
3429 return true;
3430 default:
3431 return false;
3432 }
3433}
3434
3435static constexpr unsigned ModifierOpNames[] = {
3436 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3437 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3438 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3439
3441 unsigned Opc = MI.getOpcode();
3442 for (unsigned Name : reverse(ModifierOpNames)) {
3444 if (Idx >= 0)
3445 MI.removeOperand(Idx);
3446 }
3447}
3448
3450 Register Reg, MachineRegisterInfo *MRI) const {
3451 if (!MRI->hasOneNonDBGUse(Reg))
3452 return false;
3453
3454 switch (DefMI.getOpcode()) {
3455 default:
3456 return false;
3457 case AMDGPU::V_MOV_B64_e32:
3458 case AMDGPU::S_MOV_B64:
3459 case AMDGPU::V_MOV_B64_PSEUDO:
3460 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3461 case AMDGPU::V_MOV_B32_e32:
3462 case AMDGPU::S_MOV_B32:
3463 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3464 break;
3465 }
3466
3467 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
3468 assert(ImmOp);
3469 // FIXME: We could handle FrameIndex values here.
3470 if (!ImmOp->isImm())
3471 return false;
3472
3473 auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
3474 int64_t Imm = ImmOp->getImm();
3475 switch (UseOp.getSubReg()) {
3476 default:
3477 return Imm;
3478 case AMDGPU::sub0:
3479 return Lo_32(Imm);
3480 case AMDGPU::sub1:
3481 return Hi_32(Imm);
3482 case AMDGPU::lo16:
3483 return SignExtend64<16>(Imm);
3484 case AMDGPU::hi16:
3485 return SignExtend64<16>(Imm >> 16);
3486 case AMDGPU::sub1_lo16:
3487 return SignExtend64<16>(Imm >> 32);
3488 case AMDGPU::sub1_hi16:
3489 return SignExtend64<16>(Imm >> 48);
3490 }
3491 };
3492
3493 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3494
3495 unsigned Opc = UseMI.getOpcode();
3496 if (Opc == AMDGPU::COPY) {
3497 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3498
3499 Register DstReg = UseMI.getOperand(0).getReg();
3500 unsigned OpSize = getOpSize(UseMI, 0);
3501 bool Is16Bit = OpSize == 2;
3502 bool Is64Bit = OpSize == 8;
3503 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3504 unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3505 : AMDGPU::V_MOV_B32_e32
3506 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3507 : AMDGPU::S_MOV_B32;
3508 APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)),
3509 /*isSigned=*/true, /*implicitTrunc=*/true);
3510
3511 if (RI.isAGPR(*MRI, DstReg)) {
3512 if (Is64Bit || !isInlineConstant(Imm))
3513 return false;
3514 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3515 }
3516
3517 if (Is16Bit) {
3518 if (isVGPRCopy)
3519 return false; // Do not clobber vgpr_hi16
3520
3521 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3522 return false;
3523
3524 UseMI.getOperand(0).setSubReg(0);
3525 if (DstReg.isPhysical()) {
3526 DstReg = RI.get32BitRegister(DstReg);
3527 UseMI.getOperand(0).setReg(DstReg);
3528 }
3529 assert(UseMI.getOperand(1).getReg().isVirtual());
3530 }
3531
3532 const MCInstrDesc &NewMCID = get(NewOpc);
3533 if (DstReg.isPhysical() &&
3534 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3535 return false;
3536
3537 UseMI.setDesc(NewMCID);
3538 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3539 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3540 return true;
3541 }
3542
3543 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3544 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3545 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3546 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3547 Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
3548 // Don't fold if we are using source or output modifiers. The new VOP2
3549 // instructions don't have them.
3550 if (hasAnyModifiersSet(UseMI))
3551 return false;
3552
3553 // If this is a free constant, there's no reason to do this.
3554 // TODO: We could fold this here instead of letting SIFoldOperands do it
3555 // later.
3556 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
3557
3558 // Any src operand can be used for the legality check.
3559 if (isInlineConstant(UseMI, *Src0, *ImmOp))
3560 return false;
3561
3562 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3563 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
3564 bool IsFMA =
3565 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3566 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3567 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
3568 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3569 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3570
3571 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3572 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3573 (Src1->isReg() && Src1->getReg() == Reg)) {
3574 MachineOperand *RegSrc =
3575 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3576 if (!RegSrc->isReg())
3577 return false;
3578 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3579 ST.getConstantBusLimit(Opc) < 2)
3580 return false;
3581
3582 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3583 return false;
3584
3585 // If src2 is also a literal constant then we have to choose which one to
3586 // fold. In general it is better to choose madak so that the other literal
3587 // can be materialized in an sgpr instead of a vgpr:
3588 // s_mov_b32 s0, literal
3589 // v_madak_f32 v0, s0, v0, literal
3590 // Instead of:
3591 // v_mov_b32 v1, literal
3592 // v_madmk_f32 v0, v0, literal, v1
3593 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3594 if (Def && Def->isMoveImmediate() &&
3595 !isInlineConstant(Def->getOperand(1)))
3596 return false;
3597
3598 unsigned NewOpc =
3599 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3600 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
3601 : AMDGPU::V_FMAMK_F16)
3602 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3603 if (pseudoToMCOpcode(NewOpc) == -1)
3604 return false;
3605
3606 // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3607 // would also require restricting their register classes. For now
3608 // just bail out.
3609 if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3610 return false;
3611
3612 const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
3613
3614 // FIXME: This would be a lot easier if we could return a new instruction
3615 // instead of having to modify in place.
3616
3617 Register SrcReg = RegSrc->getReg();
3618 unsigned SrcSubReg = RegSrc->getSubReg();
3619 Src0->setReg(SrcReg);
3620 Src0->setSubReg(SrcSubReg);
3621 Src0->setIsKill(RegSrc->isKill());
3622
3623 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3624 Opc == AMDGPU::V_FMAC_F32_e64 ||
3625 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
3626 UseMI.untieRegOperand(
3627 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3628
3629 Src1->ChangeToImmediate(Imm);
3630
3631 removeModOperands(UseMI);
3632 UseMI.setDesc(get(NewOpc));
3633
3634 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3635 if (DeleteDef)
3636 DefMI.eraseFromParent();
3637
3638 return true;
3639 }
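// Illustrative madmk rewrite performed above (hypothetical registers and
// literal), in the same spirit as the s_mov/v_madak example earlier:
//
//   v_mov_b32 v1, 0x41200000        ; 10.0f, single non-debug use
//   v_mac_f32 v0, v2, v1
//   ; becomes
//   v_madmk_f32 v0, v2, 0x41200000, v0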
3640
3641 // Added part is the constant: Use v_madak_{f16, f32}.
3642 if (Src2->isReg() && Src2->getReg() == Reg) {
3643 if (ST.getConstantBusLimit(Opc) < 2) {
3644 // Not allowed to use constant bus for another operand.
3645 // We can however allow an inline immediate as src0.
3646 bool Src0Inlined = false;
3647 if (Src0->isReg()) {
3648 // Try to inline constant if possible.
3649 // If the def is a move-immediate and this is its only use,
3650 // we save a VGPR here.
3651 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3652 if (Def && Def->isMoveImmediate() &&
3653 isInlineConstant(Def->getOperand(1)) &&
3654 MRI->hasOneUse(Src0->getReg())) {
3655 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3656 Src0Inlined = true;
3657 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3658 RI.isSGPRReg(*MRI, Src0->getReg())) {
3659 return false;
3660 }
3661 // VGPR is okay as Src0 - fallthrough
3662 }
3663
3664 if (Src1->isReg() && !Src0Inlined) {
3665 // We have one slot for inlinable constant so far - try to fill it
3666 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3667 if (Def && Def->isMoveImmediate() &&
3668 isInlineConstant(Def->getOperand(1)) &&
3669 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3670 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3671 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3672 return false;
3673 // VGPR is okay as Src1 - fallthrough
3674 }
3675 }
3676
3677 unsigned NewOpc =
3678 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3679 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
3680 : AMDGPU::V_FMAAK_F16)
3681 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3682 if (pseudoToMCOpcode(NewOpc) == -1)
3683 return false;
3684
3685 // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3686 // would also require restricting their register classes. For now
3687 // just bail out.
3688 if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3689 return false;
3690
3691 // FIXME: This would be a lot easier if we could return a new instruction
3692 // instead of having to modify in place.
3693
3694 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3695 Opc == AMDGPU::V_FMAC_F32_e64 ||
3696 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
3697 UseMI.untieRegOperand(
3698 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3699
3700 // ChangingToImmediate adds Src2 back to the instruction.
3701 Src2->ChangeToImmediate(getImmFor(*Src2));
3702
3703 // These come before src2.
3704 removeModOperands(UseMI);
3705 UseMI.setDesc(get(NewOpc));
3706 // It might happen that UseMI was commuted
3707 // and we now have an SGPR as src1. If so, the inlined
3708 // constant and the SGPR together are illegal.
3709 legalizeOperands(UseMI);
3710
3711 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3712 if (DeleteDef)
3713 DefMI.eraseFromParent();
3714
3715 return true;
3716 }
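// Corresponding illustrative madak rewrite, where src2 holds the folded
// constant (hypothetical registers and literal):
//
//   v_mov_b32 v1, 0x40490fdb        ; pi as f32, single non-debug use
//   v_mad_f32 v0, v2, v3, v1
//   ; becomes
//   v_madak_f32 v0, v2, v3, 0x40490fdb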
3717 }
3718
3719 return false;
3720}
3721
3722static bool
3723memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3724 ArrayRef<const MachineOperand *> BaseOps2) {
3725 if (BaseOps1.size() != BaseOps2.size())
3726 return false;
3727 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3728 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3729 return false;
3730 }
3731 return true;
3732}
3733
3734static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3735 LocationSize WidthB, int OffsetB) {
3736 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3737 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3738 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3739 return LowWidth.hasValue() &&
3740 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3741}
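// Example: a 4-byte access at offset 0 and a 4-byte access at offset 4 are
// disjoint (0 + 4 <= 4), while an 8-byte access at offset 0 and a 4-byte
// access at offset 4 are not (0 + 8 > 4).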
3742
3743bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3744 const MachineInstr &MIb) const {
3745 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3746 int64_t Offset0, Offset1;
3747 LocationSize Dummy0 = 0, Dummy1 = 0;
3748 bool Offset0IsScalable, Offset1IsScalable;
3749 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3750 Dummy0, &RI) ||
3751 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3752 Dummy1, &RI))
3753 return false;
3754
3755 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3756 return false;
3757
3758 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3759 // FIXME: Handle ds_read2 / ds_write2.
3760 return false;
3761 }
3762 LocationSize Width0 = MIa.memoperands().front()->getSize();
3763 LocationSize Width1 = MIb.memoperands().front()->getSize();
3764 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3765}
3766
3767bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3768 const MachineInstr &MIb) const {
3769 assert(MIa.mayLoadOrStore() &&
3770 "MIa must load from or modify a memory location");
3771 assert(MIb.mayLoadOrStore() &&
3772 "MIb must load from or modify a memory location");
3773
3774 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3775 return false;
3776
3777 // XXX - Can we relax this between address spaces?
3778 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3779 return false;
3780
3781 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3782 return false;
3783
3784 // TODO: Should we check the address space from the MachineMemOperand? That
3785 // would allow us to distinguish objects we know don't alias based on the
3786 // underlying address space, even if it was lowered to a different one,
3787 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3788 // buffer.
3789 if (isDS(MIa)) {
3790 if (isDS(MIb))
3791 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3792
3793 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3794 }
3795
3796 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3797 if (isMUBUF(MIb) || isMTBUF(MIb))
3798 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3799
3800 if (isFLAT(MIb))
3801 return isFLATScratch(MIb);
3802
3803 return !isSMRD(MIb);
3804 }
3805
3806 if (isSMRD(MIa)) {
3807 if (isSMRD(MIb))
3808 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3809
3810 if (isFLAT(MIb))
3811 return isFLATScratch(MIb);
3812
3813 return !isMUBUF(MIb) && !isMTBUF(MIb);
3814 }
3815
3816 if (isFLAT(MIa)) {
3817 if (isFLAT(MIb)) {
3818 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3819 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3820 return true;
3821
3822 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3823 }
3824
3825 return false;
3826 }
3827
3828 return false;
3829}
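// Rough summary of the pairwise rules above, assuming both accesses have
// identical base operands and known widths:
//   DS vs DS                     -> disjoint only if the offset ranges do not overlap
//   DS vs segment-specific FLAT  -> disjoint (scratch/global cannot touch LDS)
//   SMRD vs MUBUF/MTBUF          -> conservatively treated as possibly aliasing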
3830
3831static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3832 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3833 if (Reg.isPhysical())
3834 return false;
3835 auto *Def = MRI.getUniqueVRegDef(Reg);
3836 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3837 Imm = Def->getOperand(1).getImm();
3838 if (DefMI)
3839 *DefMI = Def;
3840 return true;
3841 }
3842 return false;
3843}
3844
3845static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3846 MachineInstr **DefMI = nullptr) {
3847 if (!MO->isReg())
3848 return false;
3849 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3850 const MachineRegisterInfo &MRI = MF->getRegInfo();
3851 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3852}
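// Illustrative use (hypothetical MIR): given
//   %5:vgpr_32 = V_MOV_B32_e32 42
// a query such as getFoldableImm(Src2, Imm, &DefMI) succeeds when Src2 uses
// %5, setting Imm to 42 and DefMI to the V_MOV_B32 above.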
3853
3854static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3855 MachineInstr &NewMI) {
3856 if (LV) {
3857 unsigned NumOps = MI.getNumOperands();
3858 for (unsigned I = 1; I < NumOps; ++I) {
3859 MachineOperand &Op = MI.getOperand(I);
3860 if (Op.isReg() && Op.isKill())
3861 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3862 }
3863 }
3864}
3865
3866static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
3867 switch (Opc) {
3868 case AMDGPU::V_MAC_F16_e32:
3869 case AMDGPU::V_MAC_F16_e64:
3870 return AMDGPU::V_MAD_F16_e64;
3871 case AMDGPU::V_MAC_F32_e32:
3872 case AMDGPU::V_MAC_F32_e64:
3873 return AMDGPU::V_MAD_F32_e64;
3874 case AMDGPU::V_MAC_LEGACY_F32_e32:
3875 case AMDGPU::V_MAC_LEGACY_F32_e64:
3876 return AMDGPU::V_MAD_LEGACY_F32_e64;
3877 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3878 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3879 return AMDGPU::V_FMA_LEGACY_F32_e64;
3880 case AMDGPU::V_FMAC_F16_e32:
3881 case AMDGPU::V_FMAC_F16_e64:
3882 case AMDGPU::V_FMAC_F16_fake16_e64:
3883 return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
3884 : AMDGPU::V_FMA_F16_gfx9_e64;
3885 case AMDGPU::V_FMAC_F32_e32:
3886 case AMDGPU::V_FMAC_F32_e64:
3887 return AMDGPU::V_FMA_F32_e64;
3888 case AMDGPU::V_FMAC_F64_e32:
3889 case AMDGPU::V_FMAC_F64_e64:
3890 return AMDGPU::V_FMA_F64_e64;
3891 default:
3892 llvm_unreachable("invalid instruction");
3893 }
3894}
3895
3896MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3897 LiveVariables *LV,
3898 LiveIntervals *LIS) const {
3899 MachineBasicBlock &MBB = *MI.getParent();
3900 unsigned Opc = MI.getOpcode();
3901
3902 // Handle MFMA.
3903 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3904 if (NewMFMAOpc != -1) {
3905 MachineInstrBuilder MIB =
3906 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3907 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3908 MIB.add(MI.getOperand(I));
3909 updateLiveVariables(LV, MI, *MIB);
3910 if (LIS) {
3911 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3912 // SlotIndex of defs needs to be updated when converting to early-clobber
3913 MachineOperand &Def = MIB->getOperand(0);
3914 if (Def.isEarlyClobber() && Def.isReg() &&
3915 LIS->hasInterval(Def.getReg())) {
3916 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
3917 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
3918 auto &LI = LIS->getInterval(Def.getReg());
3919 auto UpdateDefIndex = [&](LiveRange &LR) {
3920 auto *S = LR.find(OldIndex);
3921 if (S != LR.end() && S->start == OldIndex) {
3922 assert(S->valno && S->valno->def == OldIndex);
3923 S->start = NewIndex;
3924 S->valno->def = NewIndex;
3925 }
3926 };
3927 UpdateDefIndex(LI);
3928 for (auto &SR : LI.subranges())
3929 UpdateDefIndex(SR);
3930 }
3931 }
3932 return MIB;
3933 }
3934
3935 if (SIInstrInfo::isWMMA(MI)) {
3936 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
3937 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3938 .setMIFlags(MI.getFlags());
3939 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3940 MIB->addOperand(MI.getOperand(I));
3941
3942 updateLiveVariables(LV, MI, *MIB);
3943 if (LIS)
3944 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3945
3946 return MIB;
3947 }
3948
3949 assert(
3950 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3951 "V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
3952 "pre-RA");
3953
3954 // Handle MAC/FMAC.
3955 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3956 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3957 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
3958 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3959 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3960 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3961 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3962 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3963 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3964 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3965 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3966 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3967 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3968 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3969 bool Src0Literal = false;
3970
3971 switch (Opc) {
3972 default:
3973 return nullptr;
3974 case AMDGPU::V_MAC_F16_e64:
3975 case AMDGPU::V_FMAC_F16_e64:
3976 case AMDGPU::V_FMAC_F16_fake16_e64:
3977 case AMDGPU::V_MAC_F32_e64:
3978 case AMDGPU::V_MAC_LEGACY_F32_e64:
3979 case AMDGPU::V_FMAC_F32_e64:
3980 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3981 case AMDGPU::V_FMAC_F64_e64:
3982 break;
3983 case AMDGPU::V_MAC_F16_e32:
3984 case AMDGPU::V_FMAC_F16_e32:
3985 case AMDGPU::V_MAC_F32_e32:
3986 case AMDGPU::V_MAC_LEGACY_F32_e32:
3987 case AMDGPU::V_FMAC_F32_e32:
3988 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3989 case AMDGPU::V_FMAC_F64_e32: {
3990 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3991 AMDGPU::OpName::src0);
3992 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3993 if (!Src0->isReg() && !Src0->isImm())
3994 return nullptr;
3995
3996 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3997 Src0Literal = true;
3998
3999 break;
4000 }
4001 }
4002
4003 MachineInstrBuilder MIB;
4004 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4005 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4006 const MachineOperand *Src0Mods =
4007 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4008 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4009 const MachineOperand *Src1Mods =
4010 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4011 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4012 const MachineOperand *Src2Mods =
4013 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4014 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4015 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4016 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4017
4018 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
4019 !IsLegacy &&
4020 // If we have an SGPR input, we will violate the constant bus restriction.
4021 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4022 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4023 MachineInstr *DefMI;
4024 const auto killDef = [&]() -> void {
4025 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4026 // The only user is the instruction which will be killed.
4027 Register DefReg = DefMI->getOperand(0).getReg();
4028
4029 if (MRI.hasOneNonDBGUse(DefReg)) {
4030 // We cannot just remove the DefMI here; the calling pass will crash.
4031 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
4032 DefMI->getOperand(0).setIsDead(true);
4033 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
4034 DefMI->removeOperand(I);
4035 if (LV)
4036 LV->getVarInfo(DefReg).AliveBlocks.clear();
4037 }
4038
4039 if (LIS) {
4040 LiveInterval &DefLI = LIS->getInterval(DefReg);
4041
4042 // We cannot delete the original instruction here, so hack out the use
4043 // in the original instruction with a dummy register so we can use
4044 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4045 // not have the complexity of deleting a use to consider here.
4046 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4047 for (MachineOperand &MIOp : MI.uses()) {
4048 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4049 MIOp.setIsUndef(true);
4050 MIOp.setReg(DummyReg);
4051 }
4052 }
4053
4054 LIS->shrinkToUses(&DefLI);
4055 }
4056 };
4057
4058 int64_t Imm;
4059 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4060 unsigned NewOpc =
4061 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
4062 : AMDGPU::V_FMAAK_F16)
4063 : AMDGPU::V_FMAAK_F32)
4064 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
4065 if (pseudoToMCOpcode(NewOpc) != -1) {
4066 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4067 .add(*Dst)
4068 .add(*Src0)
4069 .add(*Src1)
4070 .addImm(Imm)
4071 .setMIFlags(MI.getFlags());
4072 updateLiveVariables(LV, MI, *MIB);
4073 if (LIS)
4074 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4075 killDef();
4076 return MIB;
4077 }
4078 }
4079 unsigned NewOpc =
4080 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
4081 : AMDGPU::V_FMAMK_F16)
4082 : AMDGPU::V_FMAMK_F32)
4083 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
4084 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4085 if (pseudoToMCOpcode(NewOpc) != -1) {
4086 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4087 .add(*Dst)
4088 .add(*Src0)
4089 .addImm(Imm)
4090 .add(*Src2)
4091 .setMIFlags(MI.getFlags());
4092 updateLiveVariables(LV, MI, *MIB);
4093
4094 if (LIS)
4095 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4096 killDef();
4097 return MIB;
4098 }
4099 }
4100 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4101 if (Src0Literal) {
4102 Imm = Src0->getImm();
4103 DefMI = nullptr;
4104 }
4105 if (pseudoToMCOpcode(NewOpc) != -1 &&
4106 isOperandLegal(
4107 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4108 Src1)) {
4109 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4110 .add(*Dst)
4111 .add(*Src1)
4112 .addImm(Imm)
4113 .add(*Src2)
4114 .setMIFlags(MI.getFlags());
4115 updateLiveVariables(LV, MI, *MIB);
4116
4117 if (LIS)
4118 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4119 if (DefMI)
4120 killDef();
4121 return MIB;
4122 }
4123 }
4124 }
4125
4126 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4127 // if VOP3 does not allow a literal operand.
4128 if (Src0Literal && !ST.hasVOP3Literal())
4129 return nullptr;
4130
4131 unsigned NewOpc = getNewFMAInst(ST, Opc);
4132
4133 if (pseudoToMCOpcode(NewOpc) == -1)
4134 return nullptr;
4135
4136 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4137 .add(*Dst)
4138 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4139 .add(*Src0)
4140 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4141 .add(*Src1)
4142 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4143 .add(*Src2)
4144 .addImm(Clamp ? Clamp->getImm() : 0)
4145 .addImm(Omod ? Omod->getImm() : 0)
4146 .setMIFlags(MI.getFlags());
4147 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4148 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4149 updateLiveVariables(LV, MI, *MIB);
4150 if (LIS)
4151 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4152 return MIB;
4153}
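// Illustrative two-address to three-address conversion handled above
// (registers are made up):
//
//   $vgpr0 = V_FMAC_F32_e32 $vgpr1, $vgpr2, $vgpr0     ; dst tied to src2
//   ; becomes
//   $vgpr0 = V_FMA_F32_e64 0, $vgpr1, 0, $vgpr2, 0, $vgpr0, 0, 0
//
// freeing the register allocator from the dst/src2 tie at the cost of the
// longer VOP3 encoding.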
4154
4155// It's not generally safe to move VALU instructions across these since it will
4156// start using the register as a base index rather than directly.
4157// XXX - Why isn't hasSideEffects sufficient for these?
4158static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4159 switch (MI.getOpcode()) {
4160 case AMDGPU::S_SET_GPR_IDX_ON:
4161 case AMDGPU::S_SET_GPR_IDX_MODE:
4162 case AMDGPU::S_SET_GPR_IDX_OFF:
4163 return true;
4164 default:
4165 return false;
4166 }
4167}
4168
4169bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4170 const MachineBasicBlock *MBB,
4171 const MachineFunction &MF) const {
4172 // Skipping the check for SP writes in the base implementation. The reason it
4173 // was added was apparently due to compile time concerns.
4174 //
4175 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4176 // but is probably avoidable.
4177
4178 // Copied from base implementation.
4179 // Terminators and labels can't be scheduled around.
4180 if (MI.isTerminator() || MI.isPosition())
4181 return true;
4182
4183 // INLINEASM_BR can jump to another block
4184 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4185 return true;
4186
4187 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4188 return true;
4189
4190 // Target-independent instructions do not have an implicit-use of EXEC, even
4191 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4192 // boundaries prevents incorrect movements of such instructions.
4193 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4194 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4195 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4196 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4197 changesVGPRIndexingMode(MI);
4198}
4199
4200bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4201 return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
4202}
4203
4204bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4205 // Skip the full operand and register alias search modifiesRegister
4206 // does. There's only a handful of instructions that touch this, it's only an
4207 // implicit def, and doesn't alias any other registers.
4208 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4209}
4210
4211bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4212 unsigned Opcode = MI.getOpcode();
4213
4214 if (MI.mayStore() && isSMRD(MI))
4215 return true; // scalar store or atomic
4216
4217 // This will terminate the function when other lanes may need to continue.
4218 if (MI.isReturn())
4219 return true;
4220
4221 // These instructions cause shader I/O that may cause hardware lockups
4222 // when executed with an empty EXEC mask.
4223 //
4224 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4225 // EXEC = 0, but checking for that case here seems not worth it
4226 // given the typical code patterns.
4227 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4228 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4229 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4230 return true;
4231
4232 if (MI.isCall() || MI.isInlineAsm())
4233 return true; // conservative assumption
4234
4235 // Assume that barrier interactions are only intended with active lanes.
4236 if (isBarrier(Opcode))
4237 return true;
4238
4239 // A mode change is a scalar operation that influences vector instructions.
4240 if (modifiesModeRegister(MI))
4241 return true;
4242
4243 // These are like SALU instructions in terms of effects, so it's questionable
4244 // whether we should return true for those.
4245 //
4246 // However, executing them with EXEC = 0 causes them to operate on undefined
4247 // data, which we avoid by returning true here.
4248 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4249 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4250 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4251 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4252 return true;
4253
4254 return false;
4255}
4256
4257bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4258 const MachineInstr &MI) const {
4259 if (MI.isMetaInstruction())
4260 return false;
4261
4262 // This won't read exec if this is an SGPR->SGPR copy.
4263 if (MI.isCopyLike()) {
4264 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4265 return true;
4266
4267 // Make sure this isn't copying exec as a normal operand
4268 return MI.readsRegister(AMDGPU::EXEC, &RI);
4269 }
4270
4271 // Make a conservative assumption about the callee.
4272 if (MI.isCall())
4273 return true;
4274
4275 // Be conservative with any unhandled generic opcodes.
4276 if (!isTargetSpecificOpcode(MI.getOpcode()))
4277 return true;
4278
4279 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4280}
4281
4282bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4283 switch (Imm.getBitWidth()) {
4284 case 1: // This likely will be a condition code mask.
4285 return true;
4286
4287 case 32:
4288 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4289 ST.hasInv2PiInlineImm());
4290 case 64:
4291 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4292 ST.hasInv2PiInlineImm());
4293 case 16:
4294 return ST.has16BitInsts() &&
4295 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4296 ST.hasInv2PiInlineImm());
4297 default:
4298 llvm_unreachable("invalid bitwidth");
4299 }
4300}
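// For reference, the 32-bit inline constants accepted here are the integers
// -16..64 and the floats +-0.5, +-1.0, +-2.0, +-4.0 and 0.0, plus 1/(2*pi)
// when the subtarget reports hasInv2PiInlineImm(); everything else must be
// emitted as a literal.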
4301
4302bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4303 APInt IntImm = Imm.bitcastToAPInt();
4304 int64_t IntImmVal = IntImm.getSExtValue();
4305 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4306 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4307 default:
4308 llvm_unreachable("invalid fltSemantics");
4309 case APFloat::S_IEEEsingle:
4310 case APFloat::S_IEEEdouble:
4311 return isInlineConstant(IntImm);
4312 case APFloat::S_BFloat:
4313 return ST.has16BitInsts() &&
4314 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4315 case APFloat::S_IEEEhalf:
4316 return ST.has16BitInsts() &&
4317 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4318 }
4319}
4320
4321bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
4322 uint8_t OperandType) const {
4323 assert(!MO.isReg() && "isInlineConstant called on register operand!");
4324 if (!MO.isImm())
4325 return false;
4326
4327 // MachineOperand provides no way to tell the true operand size, since it only
4328 // records a 64-bit value. We need to know the size to determine if a 32-bit
4329 // floating point immediate bit pattern is legal for an integer immediate. It
4330 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4331
4332 int64_t Imm = MO.getImm();
4333 switch (OperandType) {
4346 int32_t Trunc = static_cast<int32_t>(Imm);
4347 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4348 }
4354 return AMDGPU::isInlinableLiteral64(MO.getImm(),
4355 ST.hasInv2PiInlineImm());
4359 // We would expect inline immediates to not be concerned with an integer/fp
4360 // distinction. However, in the case of 16-bit integer operations, the
4361 // "floating point" values appear not to work. The hardware seems to read
4362 // the low 16 bits of 32-bit immediates, which happens to always work for the integer
4363 // values.
4364 //
4365 // See llvm bugzilla 46302.
4366 //
4367 // TODO: Theoretically we could use op-sel to use the high bits of the
4368 // 32-bit FP values.
4386 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4387 // A few special case instructions have 16-bit operands on subtargets
4388 // where 16-bit instructions are not legal.
4389 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4390 // constants in these cases
4391 int16_t Trunc = static_cast<int16_t>(Imm);
4392 return ST.has16BitInsts() &&
4394 }
4395
4396 return false;
4397 }
4402 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4403 int16_t Trunc = static_cast<int16_t>(Imm);
4404 return ST.has16BitInsts() &&
4406 }
4407 return false;
4408 }
4411 return false;
4414 // Always embedded in the instruction for free.
4415 return true;
4425 // Just ignore anything else.
4426 return true;
4427 default:
4428 llvm_unreachable("invalid operand type");
4429 }
4430}
4431
4432static bool compareMachineOp(const MachineOperand &Op0,
4433 const MachineOperand &Op1) {
4434 if (Op0.getType() != Op1.getType())
4435 return false;
4436
4437 switch (Op0.getType()) {
4439 return Op0.getReg() == Op1.getReg();
4441 return Op0.getImm() == Op1.getImm();
4442 default:
4443 llvm_unreachable("Didn't expect to be comparing these operand types");
4444 }
4445}
4446
4447bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4448 const MachineOperand &MO) const {
4449 const MCInstrDesc &InstDesc = MI.getDesc();
4450 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4451
4452 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4453
4454 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4455 return true;
4456
4457 if (OpInfo.RegClass < 0)
4458 return false;
4459
4460 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4461 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4462 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4463 AMDGPU::OpName::src2))
4464 return false;
4465 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4466 }
4467
4468 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4469 return false;
4470
4471 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4472 return true;
4473
4474 return ST.hasVOP3Literal();
4475}
4476
4477bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4478 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4479 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4480 return false;
4481
4482 int Op32 = AMDGPU::getVOPe32(Opcode);
4483 if (Op32 == -1)
4484 return false;
4485
4486 return pseudoToMCOpcode(Op32) != -1;
4487}
4488
4489bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4490 // The src0_modifier operand is present on all instructions
4491 // that have modifiers.
4492
4493 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4494}
4495
4496bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4497 unsigned OpName) const {
4498 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4499 return Mods && Mods->getImm();
4500}
4501
4502bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4503 return any_of(ModifierOpNames,
4504 [&](unsigned Name) { return hasModifiersSet(MI, Name); });
4505}
4506
4507bool SIInstrInfo::canShrink(const MachineInstr &MI,
4508 const MachineRegisterInfo &MRI) const {
4509 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4510 // Can't shrink instruction with three operands.
4511 if (Src2) {
4512 switch (MI.getOpcode()) {
4513 default: return false;
4514
4515 case AMDGPU::V_ADDC_U32_e64:
4516 case AMDGPU::V_SUBB_U32_e64:
4517 case AMDGPU::V_SUBBREV_U32_e64: {
4518 const MachineOperand *Src1
4519 = getNamedOperand(MI, AMDGPU::OpName::src1);
4520 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4521 return false;
4522 // Additional verification is needed for sdst/src2.
4523 return true;
4524 }
4525 case AMDGPU::V_MAC_F16_e64:
4526 case AMDGPU::V_MAC_F32_e64:
4527 case AMDGPU::V_MAC_LEGACY_F32_e64:
4528 case AMDGPU::V_FMAC_F16_e64:
4529 case AMDGPU::V_FMAC_F16_fake16_e64:
4530 case AMDGPU::V_FMAC_F32_e64:
4531 case AMDGPU::V_FMAC_F64_e64:
4532 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4533 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4534 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4535 return false;
4536 break;
4537
4538 case AMDGPU::V_CNDMASK_B32_e64:
4539 break;
4540 }
4541 }
4542
4543 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4544 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4545 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4546 return false;
4547
4548 // We don't need to check src0, all input types are legal, so just make sure
4549 // src0 isn't using any modifiers.
4550 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4551 return false;
4552
4553 // Can it be shrunk to a valid 32 bit opcode?
4554 if (!hasVALU32BitEncoding(MI.getOpcode()))
4555 return false;
4556
4557 // Check output modifiers
4558 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4559 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4560 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4561 // TODO: Can we avoid checking bound_ctrl/fi here?
4562 // They are only used by permlane*_swap special case.
4563 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4564 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4565}
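// Example of what passes this check (illustrative): a plain
//   V_ADD_F32_e64 %d, %a, %b
// with no source/output modifiers and a VGPR in src1 can shrink to
// V_ADD_F32_e32, while the same instruction with clamp/omod set or an SGPR
// in src1 cannot.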
4566
4567// Set VCC operand with all flags from \p Orig, except for setting it as
4568// implicit.
4569static void copyFlagsToImplicitVCC(MachineInstr &MI,
4570 const MachineOperand &Orig) {
4571
4572 for (MachineOperand &Use : MI.implicit_operands()) {
4573 if (Use.isUse() &&
4574 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4575 Use.setIsUndef(Orig.isUndef());
4576 Use.setIsKill(Orig.isKill());
4577 return;
4578 }
4579 }
4580}
4581
4582MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4583 unsigned Op32) const {
4584 MachineBasicBlock *MBB = MI.getParent();
4585
4586 const MCInstrDesc &Op32Desc = get(Op32);
4587 MachineInstrBuilder Inst32 =
4588 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4589 .setMIFlags(MI.getFlags());
4590
4591 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4592 // For VOPC instructions, this is replaced by an implicit def of vcc.
4593
4594 // We assume the defs of the shrunk opcode are in the same order, and the
4595 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4596 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4597 Inst32.add(MI.getOperand(I));
4598
4599 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4600
4601 int Idx = MI.getNumExplicitDefs();
4602 for (const MachineOperand &Use : MI.explicit_uses()) {
4603 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4604 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4605 continue;
4606
4607 if (&Use == Src2) {
4608 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4609 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4610 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4611 // of vcc was already added during the initial BuildMI, but we
4612 // 1) may need to change vcc to vcc_lo to preserve the original register
4613 // 2) have to preserve the original flags.
4614 copyFlagsToImplicitVCC(*Inst32, *Src2);
4615 continue;
4616 }
4617 }
4618
4619 Inst32.add(Use);
4620 }
4621
4622 // FIXME: Losing implicit operands
4623 fixImplicitOperands(*Inst32);
4624 return Inst32;
4625}
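// Illustrative shrink (operands made up): a VOP3-encoded
//   %d = V_CNDMASK_B32_e64 0, %a, 0, %b, $vcc
// is rebuilt as
//   %d = V_CNDMASK_B32_e32 %a, %b, implicit $vcc
// with the explicit select operand becoming the implicit vcc read described
// in the comment above.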
4626
4627bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4628 const MachineOperand &MO,
4629 const MCOperandInfo &OpInfo) const {
4630 // Literal constants use the constant bus.
4631 if (!MO.isReg())
4632 return !isInlineConstant(MO, OpInfo);
4633
4634 if (!MO.isUse())
4635 return false;
4636
4637 if (MO.getReg().isVirtual())
4638 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4639
4640 // Null is free
4641 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4642 return false;
4643
4644 // SGPRs use the constant bus
4645 if (MO.isImplicit()) {
4646 return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
4647 MO.getReg() == AMDGPU::VCC_LO;
4648 }
4649 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4650 AMDGPU::SReg_64RegClass.contains(MO.getReg());
4651}
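// Rule of thumb encoded above: SGPRs (including m0 and vcc) and literal
// constants consume the limited constant bus bandwidth of a VALU
// instruction, while VGPRs, inline constants and the null register do not.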
4652
4653static Register findImplicitSGPRRead(const MachineInstr &MI) {
4654 for (const MachineOperand &MO : MI.implicit_operands()) {
4655 // We only care about reads.
4656 if (MO.isDef())
4657 continue;
4658
4659 switch (MO.getReg()) {
4660 case AMDGPU::VCC:
4661 case AMDGPU::VCC_LO:
4662 case AMDGPU::VCC_HI:
4663 case AMDGPU::M0:
4664 case AMDGPU::FLAT_SCR:
4665 return MO.getReg();
4666
4667 default:
4668 break;
4669 }
4670 }
4671
4672 return Register();
4673}
4674
4675static bool shouldReadExec(const MachineInstr &MI) {
4676 if (SIInstrInfo::isVALU(MI)) {
4677 switch (MI.getOpcode()) {
4678 case AMDGPU::V_READLANE_B32:
4679 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4680 case AMDGPU::V_WRITELANE_B32:
4681 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4682 return false;
4683 }
4684
4685 return true;
4686 }
4687
4688 if (MI.isPreISelOpcode() ||
4689 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4690 SIInstrInfo::isSALU(MI) ||
4691 SIInstrInfo::isSMRD(MI))
4692 return false;
4693
4694 return true;
4695}
4696
4697static bool isRegOrFI(const MachineOperand &MO) {
4698 return MO.isReg() || MO.isFI();
4699}
4700
4701static bool isSubRegOf(const SIRegisterInfo &TRI,
4702 const MachineOperand &SuperVec,
4703 const MachineOperand &SubReg) {
4704 if (SubReg.getReg().isPhysical())
4705 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4706
4707 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4708 SubReg.getReg() == SuperVec.getReg();
4709}
4710
4711// Verify the illegal copy from vector register to SGPR for generic opcode COPY
4712bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4713 const MachineRegisterInfo &MRI,
4714 StringRef &ErrInfo) const {
4715 Register DstReg = MI.getOperand(0).getReg();
4716 Register SrcReg = MI.getOperand(1).getReg();
4717 // This is a check for copy from vector register to SGPR
4718 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4719 ErrInfo = "illegal copy from vector register to SGPR";
4720 return false;
4721 }
4722 return true;
4723}
4724
4725bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4726 StringRef &ErrInfo) const {
4727 uint16_t Opcode = MI.getOpcode();
4728 const MachineFunction *MF = MI.getParent()->getParent();
4729 const MachineRegisterInfo &MRI = MF->getRegInfo();
4730
4731 // FIXME: At this point the COPY verify is done only for non-ssa forms.
4732 // Find a better property to recognize the point where instruction selection
4733 // is just done.
4734 // We can only enforce this check after SIFixSGPRCopies pass so that the
4735 // illegal copies are legalized and thereafter we don't expect a pass
4736 // inserting similar copies.
4737 if (!MRI.isSSA() && MI.isCopy())
4738 return verifyCopy(MI, MRI, ErrInfo);
4739
4740 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
4741 return true;
4742
4743 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4744 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4745 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4746 int Src3Idx = -1;
4747 if (Src0Idx == -1) {
4748 // VOPD V_DUAL_* instructions use different operand names.
4749 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4750 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4751 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4752 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4753 }
4754
4755 // Make sure the number of operands is correct.
4756 const MCInstrDesc &Desc = get(Opcode);
4757 if (!Desc.isVariadic() &&
4758 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4759 ErrInfo = "Instruction has wrong number of operands.";
4760 return false;
4761 }
4762
4763 if (MI.isInlineAsm()) {
4764 // Verify register classes for inlineasm constraints.
4765 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4766 I != E; ++I) {
4767 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4768 if (!RC)
4769 continue;
4770
4771 const MachineOperand &Op = MI.getOperand(I);
4772 if (!Op.isReg())
4773 continue;
4774
4775 Register Reg = Op.getReg();
4776 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4777 ErrInfo = "inlineasm operand has incorrect register class.";
4778 return false;
4779 }
4780 }
4781
4782 return true;
4783 }
4784
4785 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4786 ErrInfo = "missing memory operand from image instruction.";
4787 return false;
4788 }
4789
4790 // Make sure the register classes are correct.
4791 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4792 const MachineOperand &MO = MI.getOperand(i);
4793 if (MO.isFPImm()) {
4794 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4795 "all fp values to integers.";
4796 return false;
4797 }
4798
4799 int RegClass = Desc.operands()[i].RegClass;
4800
4801 switch (Desc.operands()[i].OperandType) {
4803 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4804 ErrInfo = "Illegal immediate value for operand.";
4805 return false;
4806 }
4807 break;
4812 break;
4824 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4825 ErrInfo = "Illegal immediate value for operand.";
4826 return false;
4827 }
4828 break;
4829 }
4831 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4832 ErrInfo = "Expected inline constant for operand.";
4833 return false;
4834 }
4835 break;
4838 // Check if this operand is an immediate.
4839 // FrameIndex operands will be replaced by immediates, so they are
4840 // allowed.
4841 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4842 ErrInfo = "Expected immediate, but got non-immediate";
4843 return false;
4844 }
4845 [[fallthrough]];
4846 default:
4847 continue;
4848 }
4849
4850 if (!MO.isReg())
4851 continue;
4852 Register Reg = MO.getReg();
4853 if (!Reg)
4854 continue;
4855
4856 // FIXME: Ideally we would have separate instruction definitions with the
4857 // aligned register constraint.
4858 // FIXME: We do not verify inline asm operands, but custom inline asm
4859 // verification is broken anyway
4860 if (ST.needsAlignedVGPRs()) {
4861 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4862 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4863 if (const TargetRegisterClass *SubRC =
4864 RI.getSubRegisterClass(RC, MO.getSubReg())) {
4865 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4866 if (RC)
4867 RC = SubRC;
4868 }
4869 }
4870
4871 // Check that this is the aligned version of the class.
4872 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4873 ErrInfo = "Subtarget requires even aligned vector registers";
4874 return false;
4875 }
4876 }
4877
4878 if (RegClass != -1) {
4879 if (Reg.isVirtual())
4880 continue;
4881
4882 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4883 if (!RC->contains(Reg)) {
4884 ErrInfo = "Operand has incorrect register class.";
4885 return false;
4886 }
4887 }
4888 }
4889
4890 // Verify SDWA
4891 if (isSDWA(MI)) {
4892 if (!ST.hasSDWA()) {
4893 ErrInfo = "SDWA is not supported on this target";
4894 return false;
4895 }
4896
4897 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4898
4899 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4900 if (OpIdx == -1)
4901 continue;
4902 const MachineOperand &MO = MI.getOperand(OpIdx);
4903
4904 if (!ST.hasSDWAScalar()) {
4905 // Only VGPRs on VI
4906 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4907 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4908 return false;
4909 }
4910 } else {
4911 // No immediates on GFX9
4912 if (!MO.isReg()) {
4913 ErrInfo =
4914 "Only reg allowed as operands in SDWA instructions on GFX9+";
4915 return false;
4916 }
4917 }
4918 }
4919
4920 if (!ST.hasSDWAOmod()) {
4921 // No omod allowed on VI
4922 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4923 if (OMod != nullptr &&
4924 (!OMod->isImm() || OMod->getImm() != 0)) {
4925 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4926 return false;
4927 }
4928 }
4929
4930 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
4931 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
4932 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
4933 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
4934 const MachineOperand *Src0ModsMO =
4935 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4936 unsigned Mods = Src0ModsMO->getImm();
4937 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
4938 Mods & SISrcMods::SEXT) {
4939 ErrInfo = "sext, abs and neg are not allowed on this instruction";
4940 return false;
4941 }
4942 }
4943
4944 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4945 if (isVOPC(BasicOpcode)) {
4946 if (!ST.hasSDWASdst() && DstIdx != -1) {
4947 // Only vcc allowed as dst on VI for VOPC
4948 const MachineOperand &Dst = MI.getOperand(DstIdx);
4949 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4950 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4951 return false;
4952 }
4953 } else if (!ST.hasSDWAOutModsVOPC()) {
4954 // No clamp allowed on GFX9 for VOPC
4955 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4956 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4957 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
4958 return false;
4959 }
4960
4961 // No omod allowed on GFX9 for VOPC
4962 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4963 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4964 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
4965 return false;
4966 }
4967 }
4968 }
4969
4970 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
4971 if (DstUnused && DstUnused->isImm() &&
4972 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4973 const MachineOperand &Dst = MI.getOperand(DstIdx);
4974 if (!Dst.isReg() || !Dst.isTied()) {
4975 ErrInfo = "Dst register should have tied register";
4976 return false;
4977 }
4978
4979 const MachineOperand &TiedMO =
4980 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
4981 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
4982 ErrInfo =
4983 "Dst register should be tied to implicit use of preserved register";
4984 return false;
4985 }
4986 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
4987 ErrInfo = "Dst register should use same physical register as preserved";
4988 return false;
4989 }
4990 }
4991 }
4992
4993 // Verify MIMG / VIMAGE / VSAMPLE
4994 if (isImage(MI.getOpcode()) && !MI.mayStore()) {
4995 // Ensure that the return type used is large enough for all the options
4996 // being used. TFE/LWE require an extra result register.
4997 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4998 if (DMask) {
4999 uint64_t DMaskImm = DMask->getImm();
5000 uint32_t RegCount =
5001 isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm);
5002 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5003 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5004 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5005
5006 // Adjust for packed 16 bit values
5007 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5008 RegCount = divideCeil(RegCount, 2);
5009
5010 // Adjust if using LWE or TFE
5011 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5012 RegCount += 1;
5013
5014 const uint32_t DstIdx =
5015 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
5016 const MachineOperand &Dst = MI.getOperand(DstIdx);
5017 if (Dst.isReg()) {
5018 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5019 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5020 if (RegCount > DstSize) {
5021 ErrInfo = "Image instruction returns too many registers for dst "
5022 "register class";
5023 return false;
5024 }
5025 }
5026 }
5027 }
5028
5029 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5030 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5031 unsigned ConstantBusCount = 0;
5032 bool UsesLiteral = false;
5033 const MachineOperand *LiteralVal = nullptr;
5034
5035 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5036 if (ImmIdx != -1) {
5037 ++ConstantBusCount;
5038 UsesLiteral = true;
5039 LiteralVal = &MI.getOperand(ImmIdx);
5040 }
5041
5042 SmallVector<Register, 2> SGPRsUsed;
5043 Register SGPRUsed;
5044
5045 // Only look at the true operands. Only a real operand can use the constant
5046 // bus, and we don't want to check pseudo-operands like the source modifier
5047 // flags.
5048 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5049 if (OpIdx == -1)
5050 continue;
5051 const MachineOperand &MO = MI.getOperand(OpIdx);
5052 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5053 if (MO.isReg()) {
5054 SGPRUsed = MO.getReg();
5055 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5056 ++ConstantBusCount;
5057 SGPRsUsed.push_back(SGPRUsed);
5058 }
5059 } else if (!MO.isFI()) { // Treat FI like a register.
5060 if (!UsesLiteral) {
5061 ++ConstantBusCount;
5062 UsesLiteral = true;
5063 LiteralVal = &MO;
5064 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5065 assert(isVOP2(MI) || isVOP3(MI));
5066 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5067 return false;
5068 }
5069 }
5070 }
5071 }
5072
5073 SGPRUsed = findImplicitSGPRRead(MI);
5074 if (SGPRUsed) {
5075 // Implicit uses may safely overlap true operands
5076 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5077 return !RI.regsOverlap(SGPRUsed, SGPR);
5078 })) {
5079 ++ConstantBusCount;
5080 SGPRsUsed.push_back(SGPRUsed);
5081 }
5082 }
5083
5084 // v_writelane_b32 is an exception to the constant bus restriction:
5085 // vsrc0 can be an SGPR, a constant, or m0; the lane select can be an SGPR, m0, or an inline constant
5086 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5087 Opcode != AMDGPU::V_WRITELANE_B32) {
5088 ErrInfo = "VOP* instruction violates constant bus restriction";
5089 return false;
5090 }
5091
5092 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5093 ErrInfo = "VOP3 instruction uses literal";
5094 return false;
5095 }
5096 }
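// Example of a violation caught above (illustrative): on a subtarget with a
// constant bus limit of 1,
//   V_ADD_F32_e64 %d, %sgpr0, %sgpr1
// reads two different SGPRs and is rejected with "VOP* instruction violates
// constant bus restriction".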
5097
5098 // Special case for writelane - this can break the multiple constant bus rule,
5099 // but still can't use more than one SGPR register
5100 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5101 unsigned SGPRCount = 0;
5102 Register SGPRUsed;
5103
5104 for (int OpIdx : {Src0Idx, Src1Idx}) {
5105 if (OpIdx == -1)
5106 break;
5107
5108 const MachineOperand &MO = MI.getOperand(OpIdx);
5109
5110 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5111 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5112 if (MO.getReg() != SGPRUsed)
5113 ++SGPRCount;
5114 SGPRUsed = MO.getReg();
5115 }
5116 }
5117 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5118 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5119 return false;
5120 }
5121 }
5122 }
5123
5124 // Verify misc. restrictions on specific instructions.
5125 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5126 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5127 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5128 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5129 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5130 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5131 if (!compareMachineOp(Src0, Src1) &&
5132 !compareMachineOp(Src0, Src2)) {
5133 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5134 return false;
5135 }
5136 }
5137 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5138 SISrcMods::ABS) ||
5139 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5140 SISrcMods::ABS) ||
5141 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5142 SISrcMods::ABS)) {
5143 ErrInfo = "ABS not allowed in VOP3B instructions";
5144 return false;
5145 }
5146 }
5147
5148 if (isSOP2(MI) || isSOPC(MI)) {
5149 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5150 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5151
5152 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5153 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5154 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5155 !Src0.isIdenticalTo(Src1)) {
5156 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5157 return false;
5158 }
5159 }
5160
5161 if (isSOPK(MI)) {
5162 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5163 if (Desc.isBranch()) {
5164 if (!Op->isMBB()) {
5165 ErrInfo = "invalid branch target for SOPK instruction";
5166 return false;
5167 }
5168 } else {
5169 uint64_t Imm = Op->getImm();
5170 if (sopkIsZext(Opcode)) {
5171 if (!isUInt<16>(Imm)) {
5172 ErrInfo = "invalid immediate for SOPK instruction";
5173 return false;
5174 }
5175 } else {
5176 if (!isInt<16>(Imm)) {
5177 ErrInfo = "invalid immediate for SOPK instruction";
5178 return false;
5179 }
5180 }
5181 }
5182 }
5183
5184 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5185 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5186 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5187 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5188 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5189 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5190
5191 const unsigned StaticNumOps =
5192 Desc.getNumOperands() + Desc.implicit_uses().size();
5193 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5194
5195 // Allow additional implicit operands. This allows a fixup done by the post
5196 // RA scheduler where the main implicit operand is killed and implicit-defs
5197 // are added for sub-registers that remain live after this instruction.
5198 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5199 ErrInfo = "missing implicit register operands";
5200 return false;
5201 }
5202
5203 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5204 if (IsDst) {
5205 if (!Dst->isUse()) {
5206 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5207 return false;
5208 }
5209
5210 unsigned UseOpIdx;
5211 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5212 UseOpIdx != StaticNumOps + 1) {
5213 ErrInfo = "movrel implicit operands should be tied";
5214 return false;
5215 }
5216 }
5217
5218 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5219 const MachineOperand &ImpUse
5220 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5221 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5222 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5223 ErrInfo = "src0 should be subreg of implicit vector use";
5224 return false;
5225 }
5226 }
5227
5228 // Make sure we aren't losing exec uses in the td files. This mostly requires
5229 // being careful when using let Uses to try to add other use registers.
5230 if (shouldReadExec(MI)) {
5231 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5232 ErrInfo = "VALU instruction does not implicitly read exec mask";
5233 return false;
5234 }
5235 }
5236
5237 if (isSMRD(MI)) {
5238 if (MI.mayStore() &&
5239 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5240 // The register offset form of scalar stores may only use m0 as the
5241 // soffset register.
5242 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5243 if (Soff && Soff->getReg() != AMDGPU::M0) {
5244 ErrInfo = "scalar stores must use m0 as offset register";
5245 return false;
5246 }
5247 }
5248 }
5249
5250 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5251 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5252 if (Offset->getImm() != 0) {
5253 ErrInfo = "subtarget does not support offsets in flat instructions";
5254 return false;
5255 }
5256 }
5257
5258 if (isDS(MI) && !ST.hasGDS()) {
5259 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5260 if (GDSOp && GDSOp->getImm() != 0) {
5261 ErrInfo = "GDS is not supported on this subtarget";
5262 return false;
5263 }
5264 }
5265
5266 if (isImage(MI)) {
5267 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5268 if (DimOp) {
5269 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5270 AMDGPU::OpName::vaddr0);
5271 int RSrcOpName =
5272 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5273 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5274 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5275 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5276 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5277 const AMDGPU::MIMGDimInfo *Dim =
5278 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5279
5280 if (!Dim) {
5281 ErrInfo = "dim is out of range";
5282 return false;
5283 }
5284
5285 bool IsA16 = false;
5286 if (ST.hasR128A16()) {
5287 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5288 IsA16 = R128A16->getImm() != 0;
5289 } else if (ST.hasA16()) {
5290 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5291 IsA16 = A16->getImm() != 0;
5292 }
5293
5294 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5295
5296 unsigned AddrWords =
5297 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5298
5299 unsigned VAddrWords;
5300 if (IsNSA) {
5301 VAddrWords = RsrcIdx - VAddr0Idx;
5302 if (ST.hasPartialNSAEncoding() &&
5303 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5304 unsigned LastVAddrIdx = RsrcIdx - 1;
5305 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5306 }
5307 } else {
5308 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5309 if (AddrWords > 12)
5310 AddrWords = 16;
5311 }
5312
5313 if (VAddrWords != AddrWords) {
5314 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5315 << " but got " << VAddrWords << "\n");
5316 ErrInfo = "bad vaddr size";
5317 return false;
5318 }
5319 }
5320 }
5321
5322 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5323 if (DppCt) {
5324 using namespace AMDGPU::DPP;
5325
5326 unsigned DC = DppCt->getImm();
5327 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5328 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5329 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5330 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5331 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5332 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5333 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5334 ErrInfo = "Invalid dpp_ctrl value";
5335 return false;
5336 }
5337 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5338 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5339 ErrInfo = "Invalid dpp_ctrl value: "
5340 "wavefront shifts are not supported on GFX10+";
5341 return false;
5342 }
5343 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5344 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5345 ErrInfo = "Invalid dpp_ctrl value: "
5346 "broadcasts are not supported on GFX10+";
5347 return false;
5348 }
5349 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5350 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5351 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5352 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5353 !ST.hasGFX90AInsts()) {
5354 ErrInfo = "Invalid dpp_ctrl value: "
5355 "row_newbroadcast/row_share is not supported before "
5356 "GFX90A/GFX10";
5357 return false;
5358 }
5359 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5360 ErrInfo = "Invalid dpp_ctrl value: "
5361 "row_share and row_xmask are not supported before GFX10";
5362 return false;
5363 }
5364 }
5365
5366 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5367 !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) {
5368 ErrInfo = "Invalid dpp_ctrl value: "
5369 "DP ALU dpp only supports row_newbcast";
5370 return false;
5371 }
5372 }
5373
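// For loads and stores (other than VGPR spills), check that the data and
// destination operands agree on VGPR vs. AGPR usage; AGPR memory operands
// are only accepted at all on subtargets with the GFX90A instruction set.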
5374 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5375 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5376 uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
5377 : AMDGPU::OpName::vdata;
5378 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
5379 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5380 if (Data && !Data->isReg())
5381 Data = nullptr;
5382
5383 if (ST.hasGFX90AInsts()) {
5384 if (Dst && Data &&
5385 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5386 ErrInfo = "Invalid register class: "
5387 "vdata and vdst should be both VGPR or AGPR";
5388 return false;
5389 }
5390 if (Data && Data2 &&
5391 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5392 ErrInfo = "Invalid register class: "
5393 "both data operands should be VGPR or AGPR";
5394 return false;
5395 }
5396 } else {
5397 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5398 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5399 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5400 ErrInfo = "Invalid register class: "
5401 "agpr loads and stores not supported on this GPU";
5402 return false;
5403 }
5404 }
5405 }
5406
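// On subtargets that require even-aligned VGPR tuples, verify the alignment
// of the data0 operand of DS_GWS instructions and of the vaddr operand of
// MIMG instructions.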
5407 if (ST.needsAlignedVGPRs()) {
5408 const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
5409 const MachineOperand *Op = getNamedOperand(MI, OpName);
5410 if (!Op)
5411 return true;
5412 Register Reg = Op->getReg();
5413 if (Reg.isPhysical())
5414 return !(RI.getHWRegIndex(Reg) & 1);
5415 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5416 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5417 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5418 };
5419
5420 if (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
5421 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
5422 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
5423
5424 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5425 ErrInfo = "Subtarget requires even aligned vector registers "
5426 "for DS_GWS instructions";
5427 return false;
5428 }
5429 }
5430
5431 if (isMIMG(MI)) {
5432 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5433 ErrInfo = "Subtarget requires even aligned vector registers "
5434 "for vaddr operand of image instructions";
5435 return false;
5436 }
5437 }
5438 }
5439
5440 if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
5441 !ST.hasGFX90AInsts()) {
5442 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5443 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5444 ErrInfo = "Invalid register class: "
5445 "v_accvgpr_write with an SGPR is not supported on this GPU";
5446 return false;
5447 }
5448 }
5449
5450 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5451 const MachineOperand &SrcOp = MI.getOperand(1);
5452 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5453 ErrInfo = "pseudo expects only physical SGPRs";
5454 return false;
5455 }
5456 }
5457
5458 return true;
5459}
5460
5461// It is more readable to list mapped opcodes on the same line.
5462// clang-format off
5463
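// Map a scalar (SALU) opcode to the VALU opcode used when the instruction is
// moved to the VALU, or INSTRUCTION_LIST_END if there is no direct
// equivalent.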
5464 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5465 switch (MI.getOpcode()) {
5466 default: return AMDGPU::INSTRUCTION_LIST_END;
5467 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5468 case AMDGPU::COPY: return AMDGPU::COPY;
5469 case AMDGPU::PHI: return AMDGPU::PHI;
5470 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5471 case AMDGPU::WQM: return AMDGPU::WQM;
5472 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5473 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5474 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5475 case AMDGPU::S_MOV_B32: {
5476 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5477 return MI.getOperand(1).isReg() ||
5478 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5479 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5480 }
5481 case AMDGPU::S_ADD_I32:
5482 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5483 case AMDGPU::S_ADDC_U32:
5484 return AMDGPU::V_ADDC_U32_e32;
5485 case AMDGPU::S_SUB_I32:
5486 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5487 // FIXME: These are not consistently handled, and selected when the carry is
5488 // used.
5489 case AMDGPU::S_ADD_U32:
5490 return AMDGPU::V_ADD_CO_U32_e32;
5491 case AMDGPU::S_SUB_U32:
5492 return AMDGPU::V_SUB_CO_U32_e32;
5493 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5494 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5495 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5496 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5497 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5498 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5499 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5500 case AMDGPU::S_XNOR_B32:
5501 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5502 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5503 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5504 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5505 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5506 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5507 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5508 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5509 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5510 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5511 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5512 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5513 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5514 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5515 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5516 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5517 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5518 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5519 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5520 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5521 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5522 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5523 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5524 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5525 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5526 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5527 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5528 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5529 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5530 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5531 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5532 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5533 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5534 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5535 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5536 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5537 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5538 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5539 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5540 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5541 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5542 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5543 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5544 case AMDGPU::S_CVT_F32_F16:
5545 case AMDGPU::S_CVT_HI_F32_F16:
5546 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5547 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5548 case AMDGPU::S_CVT_F16_F32:
5549 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5550 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5551 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5552 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5553 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5554 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5555 case AMDGPU::S_CEIL_F16:
5556 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5557 : AMDGPU::V_CEIL_F16_fake16_e64;
5558 case AMDGPU::S_FLOOR_F16:
5559 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5560 : AMDGPU::V_FLOOR_F16_fake16_e64;
5561 case AMDGPU::S_TRUNC_F16:
5562 return AMDGPU::V_TRUNC_F16_fake16_e64;
5563 case AMDGPU::S_RNDNE_F16:
5564 return AMDGPU::V_RNDNE_F16_fake16_e64;
5565 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5566 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5567 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5568 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5569 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5570 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5571 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5572 case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
5573 case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
5574 case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
5575 case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5576 case AMDGPU::S_MINIMUM_F16:
5577 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5578 : AMDGPU::V_MINIMUM_F16_fake16_e64;
5579 case AMDGPU::S_MAXIMUM_F16:
5580 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5581 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5582 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
5583 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5584 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5585 case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
5586 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5587 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5588 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5589 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5590 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5591 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5592 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5593 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5594 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5595 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5596 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5597 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5598 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5599 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5600 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5601 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5602 case AMDGPU::S_CMP_LT_F16:
5603 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5604 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5605 case AMDGPU::S_CMP_EQ_F16:
5606 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5607 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5608 case AMDGPU::S_CMP_LE_F16:
5609 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5610 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5611 case AMDGPU::S_CMP_GT_F16:
5612 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5613 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5614 case AMDGPU::S_CMP_LG_F16:
5615 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5616 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5617 case AMDGPU::S_CMP_GE_F16:
5618 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5619 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5620 case AMDGPU::S_CMP_O_F16:
5621 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5622 : AMDGPU::V_CMP_O_F16_fake16_e64;
5623 case AMDGPU::S_CMP_U_F16:
5624 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5625 : AMDGPU::V_CMP_U_F16_fake16_e64;
5626 case AMDGPU::S_CMP_NGE_F16:
5627 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5628 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5629 case AMDGPU::S_CMP_NLG_F16:
5630 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5631 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5632 case AMDGPU::S_CMP_NGT_F16:
5633 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5634 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5635 case AMDGPU::S_CMP_NLE_F16:
5636 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5637 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5638 case AMDGPU::S_CMP_NEQ_F16:
5639 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5640 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5641 case AMDGPU::S_CMP_NLT_F16:
5642 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5643 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5644 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5645 case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
5646 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5647 case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
5648 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5649 case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
5650 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5651 case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
5652 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5653 case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
5654 }
5655 llvm_unreachable(
5656 "Unexpected scalar opcode without corresponding vector one!");
5657}
5658
5659// clang-format on
5660
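// Save the current exec mask into Reg and then set all lanes active. When SCC
// is live, use two S_MOVs instead of S_OR_SAVEEXEC so SCC is not clobbered.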
5661 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5662 MachineBasicBlock &MBB,
5663 MachineBasicBlock::iterator MBBI,
5664 const DebugLoc &DL, Register Reg,
5665 bool IsSCCLive,
5666 SlotIndexes *Indexes) const {
5667 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5668 const SIInstrInfo *TII = ST.getInstrInfo();
5669 bool IsWave32 = ST.isWave32();
5670 if (IsSCCLive) {
5671 // Insert two move instructions, one to save the original value of EXEC and
5672 // the other to turn on all bits in EXEC. This is required as we can't use
5673 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5674 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5675 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5676 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5677 .addReg(Exec, RegState::Kill);
5678 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5679 if (Indexes) {
5680 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5681 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5682 }
5683 } else {
5684 const unsigned OrSaveExec =
5685 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5686 auto SaveExec =
5687 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5688 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5689 if (Indexes)
5690 Indexes->insertMachineInstrInMaps(*SaveExec);
5691 }
5692}
5693
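// Restore the exec mask previously saved into Reg by insertScratchExecCopy.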
5694 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5695 MachineBasicBlock::iterator MBBI,
5696 const DebugLoc &DL, Register Reg,
5697 SlotIndexes *Indexes) const {
5698 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5699 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5700 auto ExecRestoreMI =
5701 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5702 if (Indexes)
5703 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5704}
5705
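// Narrow AV_* (VGPR-or-AGPR) register classes of memory operands to the
// corresponding VGPR-only classes where mixed allocation is not allowed, and
// return the properly aligned variant of the resulting class.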
5706static const TargetRegisterClass *
5707 adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
5708 const MachineRegisterInfo &MRI,
5709 const MCInstrDesc &TID, unsigned RCID,
5710 bool IsAllocatable) {
5711 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5712 (((TID.mayLoad() || TID.mayStore()) &&
5713 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5714 (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
5715 switch (RCID) {
5716 case AMDGPU::AV_32RegClassID:
5717 RCID = AMDGPU::VGPR_32RegClassID;
5718 break;
5719 case AMDGPU::AV_64RegClassID:
5720 RCID = AMDGPU::VReg_64RegClassID;
5721 break;
5722 case AMDGPU::AV_96RegClassID:
5723 RCID = AMDGPU::VReg_96RegClassID;
5724 break;
5725 case AMDGPU::AV_128RegClassID:
5726 RCID = AMDGPU::VReg_128RegClassID;
5727 break;
5728 case AMDGPU::AV_160RegClassID:
5729 RCID = AMDGPU::VReg_160RegClassID;
5730 break;
5731 case AMDGPU::AV_512RegClassID:
5732 RCID = AMDGPU::VReg_512RegClassID;
5733 break;
5734 default:
5735 break;
5736 }
5737 }
5738
5739 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5740}
5741
5742 const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
5743 unsigned OpNum, const TargetRegisterInfo *TRI,
5744 const MachineFunction &MF)
5745 const {
5746 if (OpNum >= TID.getNumOperands())
5747 return nullptr;
5748 auto RegClass = TID.operands()[OpNum].RegClass;
5749 bool IsAllocatable = false;
5750 if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
5751 // vdst and vdata should both be VGPR or AGPR, same for the DS instructions
5752 // with two data operands. Request a register class constrained to VGPR only
5753 // if both operands are present, as Machine Copy Propagation cannot check
5754 // this constraint (and possibly other passes cannot either).
5755 //
5756 // The check is limited to FLAT and DS because atomics in non-flat encoding
5757 // have their vdst and vdata tied to be the same register.
5758 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5759 AMDGPU::OpName::vdst);
5760 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5761 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5762 : AMDGPU::OpName::vdata);
5763 if (DataIdx != -1) {
5764 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5765 TID.Opcode, AMDGPU::OpName::data1);
5766 }
5767 }
5768 return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
5769 IsAllocatable);
5770}
5771
5772 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
5773 unsigned OpNo) const {
5774 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5775 const MCInstrDesc &Desc = get(MI.getOpcode());
5776 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
5777 Desc.operands()[OpNo].RegClass == -1) {
5778 Register Reg = MI.getOperand(OpNo).getReg();
5779
5780 if (Reg.isVirtual())
5781 return MRI.getRegClass(Reg);
5782 return RI.getPhysRegBaseClass(Reg);
5783 }
5784
5785 unsigned RCID = Desc.operands()[OpNo].RegClass;
5786 return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
5787}
5788
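// Replace operand OpIdx with a virtual VGPR defined by a COPY or MOV of the
// original operand, so the operand satisfies its register class constraint.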
5789 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
5790 MachineBasicBlock::iterator I = MI;
5791 MachineBasicBlock *MBB = MI.getParent();
5792 MachineOperand &MO = MI.getOperand(OpIdx);
5793 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
5794 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
5795 const TargetRegisterClass *RC = RI.getRegClass(RCID);
5796 unsigned Size = RI.getRegSizeInBits(*RC);
5797 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
5798 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
5799 : AMDGPU::V_MOV_B32_e32;
5800 if (MO.isReg())
5801 Opcode = AMDGPU::COPY;
5802 else if (RI.isSGPRClass(RC))
5803 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
5804
5805 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
5806 Register Reg = MRI.createVirtualRegister(VRC);
5807 DebugLoc DL = MBB->findDebugLoc(I);
5808 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
5809 MO.ChangeToRegister(Reg, false);
5810}
5811
5812 unsigned SIInstrInfo::buildExtractSubReg(
5813 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
5814 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
5815 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5816 if (!SuperReg.getReg().isVirtual())
5817 return RI.getSubReg(SuperReg.getReg(), SubIdx);
5818
5819 MachineBasicBlock *MBB = MI->getParent();
5820 DebugLoc DL = MI->getDebugLoc();
5821 Register SubReg = MRI.createVirtualRegister(SubRC);
5822
5823 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
5824 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5825 .addReg(SuperReg.getReg(), 0, NewSubIdx);
5826 return SubReg;
5827}
5828
5829 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
5830 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
5831 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
5832 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5833 if (Op.isImm()) {
5834 if (SubIdx == AMDGPU::sub0)
5835 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
5836 if (SubIdx == AMDGPU::sub1)
5837 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
5838
5839 llvm_unreachable("Unhandled register index for immediate");
5840 }
5841
5842 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
5843 SubIdx, SubRC);
5844 return MachineOperand::CreateReg(SubReg, false);
5845}
5846
5847// Change the order of operands from (0, 1, 2) to (0, 2, 1)
5848void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
5849 assert(Inst.getNumExplicitOperands() == 3);
5850 MachineOperand Op1 = Inst.getOperand(1);
5851 Inst.removeOperand(1);
5852 Inst.addOperand(Op1);
5853}
5854
5855 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
5856 const MCOperandInfo &OpInfo,
5857 const MachineOperand &MO) const {
5858 if (!MO.isReg())
5859 return false;
5860
5861 Register Reg = MO.getReg();
5862
5863 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
5864 if (Reg.isPhysical())
5865 return DRC->contains(Reg);
5866
5867 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
5868
5869 if (MO.getSubReg()) {
5870 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5871 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
5872 if (!SuperRC)
5873 return false;
5874
5875 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
5876 if (!DRC)
5877 return false;
5878 }
5879 return RC->hasSuperClassEq(DRC);
5880}
5881
5882 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
5883 const MachineOperand &MO) const {
5884 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5885 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
5886 unsigned Opc = MI.getOpcode();
5887
5888 if (!isLegalRegOperand(MRI, OpInfo, MO))
5889 return false;
5890
5891 // check Accumulate GPR operand
5892 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
5893 if (IsAGPR && !ST.hasMAIInsts())
5894 return false;
5895 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5896 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5897 return false;
5898 // Atomics should have both vdst and vdata either vgpr or agpr.
5899 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5900 const int DataIdx = AMDGPU::getNamedOperandIdx(
5901 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5902 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5903 MI.getOperand(DataIdx).isReg() &&
5904 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5905 return false;
5906 if ((int)OpIdx == DataIdx) {
5907 if (VDstIdx != -1 &&
5908 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5909 return false;
5910 // DS instructions with 2 src operands also must have tied RC.
5911 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
5912 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5913 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5914 return false;
5915 }
5916
5917 // Check V_ACCVGPR_WRITE_B32_e64
5918 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5919 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5920 RI.isSGPRReg(MRI, MO.getReg()))
5921 return false;
5922 return true;
5923}
5924
5925 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
5926 const MCOperandInfo &OpInfo,
5927 const MachineOperand &MO) const {
5928 if (MO.isReg())
5929 return isLegalRegOperand(MRI, OpInfo, MO);
5930
5931 // Handle non-register types that are treated like immediates.
5932 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
5933 return true;
5934}
5935
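// Check whether MO (or the existing operand at OpIdx when MO is null) is
// legal at OpIdx: it must fit the operand's register class and respect the
// constant bus limit, the VOP3 literal limit, and the inline-constant rules
// for 64-bit immediate operands.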
5936bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
5937 const MachineOperand *MO) const {
5938 const MachineFunction &MF = *MI.getParent()->getParent();
5939 const MachineRegisterInfo &MRI = MF.getRegInfo();
5940 const MCInstrDesc &InstDesc = MI.getDesc();
5941 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
5942 const TargetRegisterClass *DefinedRC =
5943 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
5944 if (!MO)
5945 MO = &MI.getOperand(OpIdx);
5946
5947 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
5948 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
5949 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
5950 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--)
5951 return false;
5952
5953 SmallDenseSet<RegSubRegPair> SGPRsUsed;
5954 if (MO->isReg())
5955 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
5956
5957 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
5958 if (i == OpIdx)
5959 continue;
5960 const MachineOperand &Op = MI.getOperand(i);
5961 if (Op.isReg()) {
5962 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
5963 if (!SGPRsUsed.count(SGPR) &&
5964 // FIXME: This can access off the end of the operands() array.
5965 usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
5966 if (--ConstantBusLimit <= 0)
5967 return false;
5968 SGPRsUsed.insert(SGPR);
5969 }
5970 } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
5971 !isInlineConstant(Op, InstDesc.operands()[i])) {
5972 if (!LiteralLimit--)
5973 return false;
5974 if (--ConstantBusLimit <= 0)
5975 return false;
5976 }
5977 }
5978 } else if (ST.hasNoF16PseudoScalarTransInlineConstants() && !MO->isReg() &&
5979 isF16PseudoScalarTrans(MI.getOpcode()) &&
5980 isInlineConstant(*MO, OpInfo)) {
5981 return false;
5982 }
5983
5984 if (MO->isReg()) {
5985 if (!DefinedRC)
5986 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
5987 return isLegalRegOperand(MI, OpIdx, *MO);
5988 }
5989
5990 if (MO->isImm()) {
5991 uint64_t Imm = MO->getImm();
5992 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
5993 bool Is64BitOp = Is64BitFPOp ||
5994 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
5995 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
5996 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
5997 if (Is64BitOp &&
5998 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
5999 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
6000 return false;
6001
6002 // FIXME: We can use sign extended 64-bit literals, but only for signed
6003 // operands. At the moment we do not know if an operand is signed.
6004 // Such operand will be encoded as its low 32 bits and then either
6005 // correctly sign extended or incorrectly zero extended by HW.
6006 if (!Is64BitFPOp && (int32_t)Imm < 0)
6007 return false;
6008 }
6009 }
6010
6011 // Handle non-register types that are treated like immediates.
6012 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6013
6014 if (!DefinedRC) {
6015 // This operand expects an immediate.
6016 return true;
6017 }
6018
6019 return isImmOperandLegal(MI, OpIdx, *MO);
6020}
6021
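// Legalize the operands of a VOP2/VOPC instruction in place: move illegal
// SGPR, literal, and AGPR operands into VGPRs, handle the lane-select
// constraints of V_WRITELANE/V_READLANE, and prefer commuting src0 and src1
// over inserting an extra move when commuting makes src1 legal.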
6022 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6023 MachineInstr &MI) const {
6024 unsigned Opc = MI.getOpcode();
6025 const MCInstrDesc &InstrDesc = get(Opc);
6026
6027 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6028 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6029
6030 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6031 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6032
6033 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6034 // we need to only have one constant bus use before GFX10.
6035 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6036 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6037 RI.isSGPRReg(MRI, Src0.getReg()))
6038 legalizeOpWithMove(MI, Src0Idx);
6039
6040 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6041 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6042 // src0/src1 with V_READFIRSTLANE.
6043 if (Opc == AMDGPU::V_WRITELANE_B32) {
6044 const DebugLoc &DL = MI.getDebugLoc();
6045 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6046 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6047 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6048 .add(Src0);
6049 Src0.ChangeToRegister(Reg, false);
6050 }
6051 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6052 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6053 const DebugLoc &DL = MI.getDebugLoc();
6054 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6055 .add(Src1);
6056 Src1.ChangeToRegister(Reg, false);
6057 }
6058 return;
6059 }
6060
6061 // No VOP2 instructions support AGPRs.
6062 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
6063 legalizeOpWithMove(MI, Src0Idx);
6064
6065 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
6066 legalizeOpWithMove(MI, Src1Idx);
6067
6068 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6069 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6070 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6071 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6072 legalizeOpWithMove(MI, Src2Idx);
6073 }
6074
6075 // VOP2 src0 operands support all operand types, so we don't need to check
6076 // their legality. If src1 is already legal, we don't need to do anything.
6077 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6078 return;
6079
6080 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6081 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6082 // select is uniform.
6083 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6084 RI.isVGPR(MRI, Src1.getReg())) {
6085 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6086 const DebugLoc &DL = MI.getDebugLoc();
6087 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6088 .add(Src1);
6089 Src1.ChangeToRegister(Reg, false);
6090 return;
6091 }
6092
6093 // We do not use commuteInstruction here because it is too aggressive and will
6094 // commute if it is possible. We only want to commute here if it improves
6095 // legality. This can be called a fairly large number of times so don't waste
6096 // compile time pointlessly swapping and checking legality again.
6097 if (HasImplicitSGPR || !MI.isCommutable()) {
6098 legalizeOpWithMove(MI, Src1Idx);
6099 return;
6100 }
6101
6102 // If src0 can be used as src1, commuting will make the operands legal.
6103 // Otherwise we have to give up and insert a move.
6104 //
6105 // TODO: Other immediate-like operand kinds could be commuted if there was a
6106 // MachineOperand::ChangeTo* for them.
6107 if ((!Src1.isImm() && !Src1.isReg()) ||
6108 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6109 legalizeOpWithMove(MI, Src1Idx);
6110 return;
6111 }
6112
6113 int CommutedOpc = commuteOpcode(MI);
6114 if (CommutedOpc == -1) {
6115 legalizeOpWithMove(MI, Src1Idx);
6116 return;
6117 }
6118
6119 MI.setDesc(get(CommutedOpc));
6120
6121 Register Src0Reg = Src0.getReg();
6122 unsigned Src0SubReg = Src0.getSubReg();
6123 bool Src0Kill = Src0.isKill();
6124
6125 if (Src1.isImm())
6126 Src0.ChangeToImmediate(Src1.getImm());
6127 else if (Src1.isReg()) {
6128 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6129 Src0.setSubReg(Src1.getSubReg());
6130 } else
6131 llvm_unreachable("Should only have register or immediate operands");
6132
6133 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6134 Src1.setSubReg(Src0SubReg);
6135 fixImplicitOperands(MI);
6136}
6137
6138 // Legalize VOP3 operands. All operand types are supported for any operand,
6139 // but only one literal constant is allowed, and only starting from GFX10.
6140 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6141 MachineInstr &MI) const {
6142 unsigned Opc = MI.getOpcode();
6143
6144 int VOP3Idx[3] = {
6145 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6146 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6147 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6148 };
6149
6150 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6151 Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
6152 // src1 and src2 must be scalar
6153 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6154 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6155 const DebugLoc &DL = MI.getDebugLoc();
6156 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6157 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6158 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6159 .add(Src1);
6160 Src1.ChangeToRegister(Reg, false);
6161 }
6162 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6163 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6164 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6165 .add(Src2);
6166 Src2.ChangeToRegister(Reg, false);
6167 }
6168 }
6169
6170 // Find the one SGPR operand we are allowed to use.
6171 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6172 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6173 SmallDenseSet<unsigned> SGPRsUsed;
6174 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6175 if (SGPRReg) {
6176 SGPRsUsed.insert(SGPRReg);
6177 --ConstantBusLimit;
6178 }
6179
6180 for (int Idx : VOP3Idx) {
6181 if (Idx == -1)
6182 break;
6183 MachineOperand &MO = MI.getOperand(Idx);
6184
6185 if (!MO.isReg()) {
6186 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6187 continue;
6188
6189 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6190 --LiteralLimit;
6191 --ConstantBusLimit;
6192 continue;
6193 }
6194
6195 --LiteralLimit;
6196 --ConstantBusLimit;
6197 legalizeOpWithMove(MI, Idx);
6198 continue;
6199 }
6200
6201 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6202 !isOperandLegal(MI, Idx, &MO)) {
6203 legalizeOpWithMove(MI, Idx);
6204 continue;
6205 }
6206
6207 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6208 continue; // VGPRs are legal
6209
6210 // We can use one SGPR in each VOP3 instruction prior to GFX10
6211 // and two starting from GFX10.
6212 if (SGPRsUsed.count(MO.getReg()))
6213 continue;
6214 if (ConstantBusLimit > 0) {
6215 SGPRsUsed.insert(MO.getReg());
6216 --ConstantBusLimit;
6217 continue;
6218 }
6219
6220 // If we make it this far, then the operand is not legal and we must
6221 // legalize it.
6222 legalizeOpWithMove(MI, Idx);
6223 }
6224
6225 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6226 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6227 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6228 legalizeOpWithMove(MI, VOP3Idx[2]);
6229}
6230
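// Copy a VGPR (or AGPR) value into a newly created SGPR of the equivalent
// class by emitting one V_READFIRSTLANE_B32 per 32-bit lane and combining the
// results with a REG_SEQUENCE. The source value is assumed to be uniform.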
6231 Register SIInstrInfo::readlaneVGPRToSGPR(
6232 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6233 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6234 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6235 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6236 if (DstRC)
6237 SRC = RI.getCommonSubClass(SRC, DstRC);
6238
6239 Register DstReg = MRI.createVirtualRegister(SRC);
6240 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6241
6242 if (RI.hasAGPRs(VRC)) {
6243 VRC = RI.getEquivalentVGPRClass(VRC);
6244 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6245 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6246 get(TargetOpcode::COPY), NewSrcReg)
6247 .addReg(SrcReg);
6248 SrcReg = NewSrcReg;
6249 }
6250
6251 if (SubRegs == 1) {
6252 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6253 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6254 .addReg(SrcReg);
6255 return DstReg;
6256 }
6257
6258 SmallVector<Register, 8> SRegs;
6259 for (unsigned i = 0; i < SubRegs; ++i) {
6260 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6261 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6262 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6263 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6264 SRegs.push_back(SGPR);
6265 }
6266
6267 MachineInstrBuilder MIB =
6268 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6269 get(AMDGPU::REG_SEQUENCE), DstReg);
6270 for (unsigned i = 0; i < SubRegs; ++i) {
6271 MIB.addReg(SRegs[i]);
6272 MIB.addImm(RI.getSubRegFromChannel(i));
6273 }
6274 return DstReg;
6275}
6276
6277 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6278 MachineInstr &MI) const {
6279
6280 // If the pointer is stored in VGPRs, then we need to move it to
6281 // SGPRs using v_readfirstlane. This is safe because we only select
6282 // loads with uniform pointers to SMRD instructions, so we know the
6283 // pointer value is uniform.
6284 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6285 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6286 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6287 SBase->setReg(SGPR);
6288 }
6289 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6290 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6291 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6292 SOff->setReg(SGPR);
6293 }
6294}
6295
6296 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6297 unsigned Opc = Inst.getOpcode();
6298 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6299 if (OldSAddrIdx < 0)
6300 return false;
6301
6302 assert(isSegmentSpecificFLAT(Inst));
6303
6304 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6305 if (NewOpc < 0)
6306 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6307 if (NewOpc < 0)
6308 return false;
6309
6310 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6311 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6312 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6313 return false;
6314
6315 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6316 if (NewVAddrIdx < 0)
6317 return false;
6318
6319 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6320
6321 // Check vaddr, it shall be zero or absent.
6322 MachineInstr *VAddrDef = nullptr;
6323 if (OldVAddrIdx >= 0) {
6324 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6325 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6326 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6327 !VAddrDef->getOperand(1).isImm() ||
6328 VAddrDef->getOperand(1).getImm() != 0)
6329 return false;
6330 }
6331
6332 const MCInstrDesc &NewDesc = get(NewOpc);
6333 Inst.setDesc(NewDesc);
6334
6335 // Callers expect iterator to be valid after this call, so modify the
6336 // instruction in place.
6337 if (OldVAddrIdx == NewVAddrIdx) {
6338 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6339 // Clear use list from the old vaddr holding a zero register.
6340 MRI.removeRegOperandFromUseList(&NewVAddr);
6341 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6342 Inst.removeOperand(OldSAddrIdx);
6343 // Update the use list with the pointer we have just moved from vaddr to
6344 // saddr position. Otherwise new vaddr will be missing from the use list.
6345 MRI.removeRegOperandFromUseList(&NewVAddr);
6346 MRI.addRegOperandToUseList(&NewVAddr);
6347 } else {
6348 assert(OldSAddrIdx == NewVAddrIdx);
6349
6350 if (OldVAddrIdx >= 0) {
6351 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6352 AMDGPU::OpName::vdst_in);
6353
6354 // removeOperand doesn't try to fixup tied operand indexes as it goes, so
6355 // it asserts. Untie the operands for now and retie them afterwards.
6356 if (NewVDstIn != -1) {
6357 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6358 Inst.untieRegOperand(OldVDstIn);
6359 }
6360
6361 Inst.removeOperand(OldVAddrIdx);
6362
6363 if (NewVDstIn != -1) {
6364 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6365 Inst.tieOperands(NewVDst, NewVDstIn);
6366 }
6367 }
6368 }
6369
6370 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6371 VAddrDef->eraseFromParent();
6372
6373 return true;
6374}
6375
6376// FIXME: Remove this when SelectionDAG is obsoleted.
6377 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6378 MachineInstr &MI) const {
6379 if (!isSegmentSpecificFLAT(MI))
6380 return;
6381
6382 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6383 // thinks they are uniform, so a readfirstlane should be valid.
6384 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6385 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6386 return;
6387
6388 if (moveFlatAddrToVGPR(MI))
6389 return;
6390
6391 const TargetRegisterClass *DeclaredRC = getRegClass(
6392 MI.getDesc(), SAddr->getOperandNo(), &RI, *MI.getParent()->getParent());
6393
6394 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6395 SAddr->setReg(ToSGPR);
6396}
6397
6398 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6399 MachineBasicBlock::iterator I,
6400 const TargetRegisterClass *DstRC,
6401 MachineOperand &Op,
6402 MachineRegisterInfo &MRI,
6403 const DebugLoc &DL) const {
6404 Register OpReg = Op.getReg();
6405 unsigned OpSubReg = Op.getSubReg();
6406
6407 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6408 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6409
6410 // Check if operand is already the correct register class.
6411 if (DstRC == OpRC)
6412 return;
6413
6414 Register DstReg = MRI.createVirtualRegister(DstRC);
6415 auto Copy =
6416 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6417 Op.setReg(DstReg);
6418
6419 MachineInstr *Def = MRI.getVRegDef(OpReg);
6420 if (!Def)
6421 return;
6422
6423 // Try to eliminate the copy if it is copying an immediate value.
6424 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6425 foldImmediate(*Copy, *Def, OpReg, &MRI);
6426
6427 bool ImpDef = Def->isImplicitDef();
6428 while (!ImpDef && Def && Def->isCopy()) {
6429 if (Def->getOperand(1).getReg().isPhysical())
6430 break;
6431 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6432 ImpDef = Def && Def->isImplicitDef();
6433 }
6434 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6435 !ImpDef)
6436 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6437}
6438
6439// Emit the actual waterfall loop, executing the wrapped instruction for each
6440// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6441// iteration, in the worst case we execute 64 (once per lane).
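// For each scalar operand, the loop body reads the value of the first active
// lane with V_READFIRSTLANE_B32 (pairwise for multi-dword operands), compares
// it against the VGPR value with V_CMP_EQ, and ANDs the per-operand masks
// into one condition that is used to update EXEC for this iteration.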
6442static void
6443 emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
6444 MachineRegisterInfo &MRI,
6445 MachineBasicBlock &LoopBB,
6446 MachineBasicBlock &BodyBB,
6447 const DebugLoc &DL,
6448 ArrayRef<MachineOperand *> ScalarOps) {
6449 MachineFunction &MF = *LoopBB.getParent();
6450 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6451 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6452 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6453 unsigned SaveExecOpc =
6454 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6455 unsigned XorTermOpc =
6456 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6457 unsigned AndOpc =
6458 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6459 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6460
6461 MachineBasicBlock::iterator I = LoopBB.begin();
6462 Register CondReg;
6463
6464 for (MachineOperand *ScalarOp : ScalarOps) {
6465 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6466 unsigned NumSubRegs = RegSize / 32;
6467 Register VScalarOp = ScalarOp->getReg();
6468
6469 if (NumSubRegs == 1) {
6470 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6471
6472 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6473 .addReg(VScalarOp);
6474
6475 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6476
6477 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6478 .addReg(CurReg)
6479 .addReg(VScalarOp);
6480
6481 // Combine the comparison results with AND.
6482 if (!CondReg) // First.
6483 CondReg = NewCondReg;
6484 else { // If not the first, we create an AND.
6485 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6486 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6487 .addReg(CondReg)
6488 .addReg(NewCondReg);
6489 CondReg = AndReg;
6490 }
6491
6492 // Update ScalarOp operand to use the SGPR ScalarOp.
6493 ScalarOp->setReg(CurReg);
6494 ScalarOp->setIsKill();
6495 } else {
6496 SmallVector<Register, 8> ReadlanePieces;
6497 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6498 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6499 "Unhandled register size");
6500
6501 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6502 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6503 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6504
6505 // Read the next variant <- also loop target.
6506 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6507 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6508
6509 // Read the next variant <- also loop target.
6510 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6511 .addReg(VScalarOp, VScalarOpUndef,
6512 TRI->getSubRegFromChannel(Idx + 1));
6513
6514 ReadlanePieces.push_back(CurRegLo);
6515 ReadlanePieces.push_back(CurRegHi);
6516
6517 // Comparison is to be done as 64-bit.
6518 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6519 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6520 .addReg(CurRegLo)
6521 .addImm(AMDGPU::sub0)
6522 .addReg(CurRegHi)
6523 .addImm(AMDGPU::sub1);
6524
6525 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6526 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6527 NewCondReg)
6528 .addReg(CurReg);
6529 if (NumSubRegs <= 2)
6530 Cmp.addReg(VScalarOp);
6531 else
6532 Cmp.addReg(VScalarOp, VScalarOpUndef,
6533 TRI->getSubRegFromChannel(Idx, 2));
6534
6535 // Combine the comparison results with AND.
6536 if (!CondReg) // First.
6537 CondReg = NewCondReg;
6538 else { // If not the first, we create an AND.
6539 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6540 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6541 .addReg(CondReg)
6542 .addReg(NewCondReg);
6543 CondReg = AndReg;
6544 }
6545 } // End for loop.
6546
6547 const auto *SScalarOpRC =
6548 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6549 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6550
6551 // Build scalar ScalarOp.
6552 auto Merge =
6553 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6554 unsigned Channel = 0;
6555 for (Register Piece : ReadlanePieces) {
6556 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6557 }
6558
6559 // Update ScalarOp operand to use the SGPR ScalarOp.
6560 ScalarOp->setReg(SScalarOp);
6561 ScalarOp->setIsKill();
6562 }
6563 }
6564
6565 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6566 MRI.setSimpleHint(SaveExec, CondReg);
6567
6568 // Update EXEC to matching lanes, saving original to SaveExec.
6569 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6570 .addReg(CondReg, RegState::Kill);
6571
6572 // The original instruction is here; we insert the terminators after it.
6573 I = BodyBB.end();
6574
6575 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6576 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6577 .addReg(Exec)
6578 .addReg(SaveExec);
6579
6580 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6581}
6582
6583// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6584// with SGPRs by iterating over all unique values across all lanes.
6585// Returns the loop basic block that now contains \p MI.
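// The block structure produced around MI is:
//
//   MBB -> LoopBB -> BodyBB -> RemainderBB
//             ^---------'
//
// LoopBB re-reads the scalar operands and narrows EXEC, BodyBB holds MI and
// the terminators that restart the loop for the remaining lanes, and
// RemainderBB restores SCC (if it was live) and the original EXEC mask.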
6586static MachineBasicBlock *
6587 loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6588 ArrayRef<MachineOperand *> ScalarOps,
6589 MachineDominatorTree *MDT,
6590 MachineBasicBlock::iterator Begin = nullptr,
6591 MachineBasicBlock::iterator End = nullptr) {
6592 MachineBasicBlock &MBB = *MI.getParent();
6593 MachineFunction &MF = *MBB.getParent();
6594 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6595 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6596 MachineRegisterInfo &MRI = MF.getRegInfo();
6597 if (!Begin.isValid())
6598 Begin = &MI;
6599 if (!End.isValid()) {
6600 End = &MI;
6601 ++End;
6602 }
6603 const DebugLoc &DL = MI.getDebugLoc();
6604 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6605 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6606 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6607
6608 // Save SCC. Waterfall Loop may overwrite SCC.
6609 Register SaveSCCReg;
6610
6611 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
6612 // rather than unlimited scan everywhere
6613 bool SCCNotDead =
6614 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
6615 std::numeric_limits<unsigned>::max()) !=
6616 MachineBasicBlock::LQR_Dead;
6617 if (SCCNotDead) {
6618 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6619 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6620 .addImm(1)
6621 .addImm(0);
6622 }
6623
6624 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6625
6626 // Save the EXEC mask
6627 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6628
6629 // Killed uses in the instruction we are waterfalling around will be
6630 // incorrect due to the added control-flow.
6631 MachineBasicBlock::iterator AfterMI = MI;
6632 ++AfterMI;
6633 for (auto I = Begin; I != AfterMI; I++) {
6634 for (auto &MO : I->all_uses())
6635 MRI.clearKillFlags(MO.getReg());
6636 }
6637
6638 // To insert the loop we need to split the block. Move everything after this
6639 // point to a new block, and insert a new empty block between the two.
6640 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
6641 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
6642 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6643 MachineFunction::iterator MBBI(MBB);
6644 ++MBBI;
6645
6646 MF.insert(MBBI, LoopBB);
6647 MF.insert(MBBI, BodyBB);
6648 MF.insert(MBBI, RemainderBB);
6649
6650 LoopBB->addSuccessor(BodyBB);
6651 BodyBB->addSuccessor(LoopBB);
6652 BodyBB->addSuccessor(RemainderBB);
6653
6654 // Move Begin to MI to the BodyBB, and the remainder of the block to
6655 // RemainderBB.
6656 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6657 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6658 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6659
6660 MBB.addSuccessor(LoopBB);
6661
6662 // Update dominators. We know that MBB immediately dominates LoopBB, that
6663 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6664 // RemainderBB. RemainderBB immediately dominates all of the successors
6665 // transferred to it from MBB that MBB used to properly dominate.
6666 if (MDT) {
6667 MDT->addNewBlock(LoopBB, &MBB);
6668 MDT->addNewBlock(BodyBB, LoopBB);
6669 MDT->addNewBlock(RemainderBB, BodyBB);
6670 for (auto &Succ : RemainderBB->successors()) {
6671 if (MDT->properlyDominates(&MBB, Succ)) {
6672 MDT->changeImmediateDominator(Succ, RemainderBB);
6673 }
6674 }
6675 }
6676
6677 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
6678
6679 MachineBasicBlock::iterator First = RemainderBB->begin();
6680 // Restore SCC
6681 if (SCCNotDead) {
6682 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
6683 .addReg(SaveSCCReg, RegState::Kill)
6684 .addImm(0);
6685 }
6686
6687 // Restore the EXEC mask
6688 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
6689 return BodyBB;
6690}
6691
6692// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
6693static std::tuple<unsigned, unsigned>
6694 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
6695 MachineBasicBlock &MBB = *MI.getParent();
6696 MachineFunction &MF = *MBB.getParent();
6697 MachineRegisterInfo &MRI = MF.getRegInfo();
6698
6699 // Extract the ptr from the resource descriptor.
6700 unsigned RsrcPtr =
6701 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
6702 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
6703
6704 // Create an empty resource descriptor
6705 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6706 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6707 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6708 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6709 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
6710
6711 // Zero64 = 0
6712 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
6713 .addImm(0);
6714
6715 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6716 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
6717 .addImm(Lo_32(RsrcDataFormat));
6718
6719 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6720 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
6721 .addImm(Hi_32(RsrcDataFormat));
6722
6723 // NewSRsrc = {Zero64, SRsrcFormat}
6724 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
6725 .addReg(Zero64)
6726 .addImm(AMDGPU::sub0_sub1)
6727 .addReg(SRsrcFormatLo)
6728 .addImm(AMDGPU::sub2)
6729 .addReg(SRsrcFormatHi)
6730 .addImm(AMDGPU::sub3);
6731
6732 return std::tuple(RsrcPtr, NewSRsrc);
6733}
6734
6735 MachineBasicBlock *
6736 SIInstrInfo::legalizeOperands(MachineInstr &MI,
6737 MachineDominatorTree *MDT) const {
6738 MachineFunction &MF = *MI.getParent()->getParent();
6739 MachineRegisterInfo &MRI = MF.getRegInfo();
6740 MachineBasicBlock *CreatedBB = nullptr;
6741
6742 // Legalize VOP2
6743 if (isVOP2(MI) || isVOPC(MI)) {
6744 legalizeOperandsVOP2(MRI, MI);
6745 return CreatedBB;
6746 }
6747
6748 // Legalize VOP3
6749 if (isVOP3(MI)) {
6750 legalizeOperandsVOP3(MRI, MI);
6751 return CreatedBB;
6752 }
6753
6754 // Legalize SMRD
6755 if (isSMRD(MI)) {
6756 legalizeOperandsSMRD(MRI, MI);
6757 return CreatedBB;
6758 }
6759
6760 // Legalize FLAT
6761 if (isFLAT(MI)) {
6762 legalizeOperandsFLAT(MRI, MI);
6763 return CreatedBB;
6764 }
6765
6766 // Legalize REG_SEQUENCE and PHI
6767 // The register class of the operands must be the same type as the register
6768 // class of the output.
6769 if (MI.getOpcode() == AMDGPU::PHI) {
6770 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
6771 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
6772 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
6773 continue;
6774 const TargetRegisterClass *OpRC =
6775 MRI.getRegClass(MI.getOperand(i).getReg());
6776 if (RI.hasVectorRegisters(OpRC)) {
6777 VRC = OpRC;
6778 } else {
6779 SRC = OpRC;
6780 }
6781 }
6782
6783 // If any of the operands are VGPR registers, then they all must be;
6784 // otherwise we will create illegal VGPR->SGPR copies when legalizing
6785 // them.
6786 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
6787 if (!VRC) {
6788 assert(SRC);
6789 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
6790 VRC = &AMDGPU::VReg_1RegClass;
6791 } else
6792 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6793 ? RI.getEquivalentAGPRClass(SRC)
6794 : RI.getEquivalentVGPRClass(SRC);
6795 } else {
6796 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6797 ? RI.getEquivalentAGPRClass(VRC)
6798 : RI.getEquivalentVGPRClass(VRC);
6799 }
6800 RC = VRC;
6801 } else {
6802 RC = SRC;
6803 }
6804
6805 // Update all the operands so they have the same type.
6806 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6807 MachineOperand &Op = MI.getOperand(I);
6808 if (!Op.isReg() || !Op.getReg().isVirtual())
6809 continue;
6810
6811 // MI is a PHI instruction.
6812 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
6813 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
6814
6815 // Avoid creating no-op copies with the same src and dst reg class. These
6816 // confuse some of the machine passes.
6817 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
6818 }
6819 }
6820
6821 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6822 // VGPR dest type and SGPR sources, insert copies so all operands are
6823 // VGPRs. This seems to help operand folding / the register coalescer.
6824 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
6825 MachineBasicBlock *MBB = MI.getParent();
6826 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
6827 if (RI.hasVGPRs(DstRC)) {
6828 // Update all the operands so they are VGPR register classes. These may
6829 // not be the same register class because REG_SEQUENCE supports mixing
6830 // subregister index types e.g. sub0_sub1 + sub2 + sub3
6831 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6832 MachineOperand &Op = MI.getOperand(I);
6833 if (!Op.isReg() || !Op.getReg().isVirtual())
6834 continue;
6835
6836 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
6837 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
6838 if (VRC == OpRC)
6839 continue;
6840
6841 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
6842 Op.setIsKill();
6843 }
6844 }
6845
6846 return CreatedBB;
6847 }
6848
6849 // Legalize INSERT_SUBREG
6850 // src0 must have the same register class as dst
6851 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
6852 Register Dst = MI.getOperand(0).getReg();
6853 Register Src0 = MI.getOperand(1).getReg();
6854 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
6855 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
6856 if (DstRC != Src0RC) {
6857 MachineBasicBlock *MBB = MI.getParent();
6858 MachineOperand &Op = MI.getOperand(1);
6859 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
6860 }
6861 return CreatedBB;
6862 }
6863
6864 // Legalize SI_INIT_M0
6865 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
6866 MachineOperand &Src = MI.getOperand(0);
6867 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6868 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6869 return CreatedBB;
6870 }
6871
6872 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6873 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
6874 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
6875 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
6876 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
6877 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
6878 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
6879 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
6880 MachineOperand &Src = MI.getOperand(1);
6881 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6882 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6883 return CreatedBB;
6884 }
6885
6886 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
6887 //
6888 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
6889 // scratch memory access. In both cases, the legalization never involves
6890 // conversion to the addr64 form.
6891 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
6892 (isMUBUF(MI) || isMTBUF(MI)))) {
6893 int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc
6894 : AMDGPU::OpName::srsrc;
6895 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
6896 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
6897 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
6898
6899 int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
6900 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
6901 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
6902 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
6903
6904 return CreatedBB;
6905 }
6906
6907 // Legalize SI_CALL
6908 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
6909 MachineOperand *Dest = &MI.getOperand(0);
6910 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
6911 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, along with
6912 // the following copies, into the waterfall loop; copies from and to
6913 // physical registers also need to live inside the loop block.
6914 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
6915 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
6916
6917 // Also move the copies to physical registers into the loop block
6918 MachineBasicBlock &MBB = *MI.getParent();
6919 MachineBasicBlock::iterator Start(&MI);
6920 while (Start->getOpcode() != FrameSetupOpcode)
6921 --Start;
6922 MachineBasicBlock::iterator End(&MI);
6923 while (End->getOpcode() != FrameDestroyOpcode)
6924 ++End;
6925 // Also include following copies of the return value
6926 ++End;
6927 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
6928 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
6929 ++End;
6930 CreatedBB =
6931 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
6932 }
6933 }
6934
6935 // Legalize s_sleep_var.
6936 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
6937 const DebugLoc &DL = MI.getDebugLoc();
6938 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6939 int Src0Idx =
6940 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
6941 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6942 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6943 .add(Src0);
6944 Src0.ChangeToRegister(Reg, false);
6945 return nullptr;
6946 }
6947
6948 // Legalize MUBUF instructions.
6949 bool isSoffsetLegal = true;
6950 int SoffsetIdx =
6951 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
6952 if (SoffsetIdx != -1) {
6953 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
6954 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
6955 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
6956 isSoffsetLegal = false;
6957 }
6958 }
6959
6960 bool isRsrcLegal = true;
6961 int RsrcIdx =
6962 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
6963 if (RsrcIdx != -1) {
6964 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6965 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
6966 isRsrcLegal = false;
6967 }
6968
6969 // The operands are legal.
6970 if (isRsrcLegal && isSoffsetLegal)
6971 return CreatedBB;
6972
6973 if (!isRsrcLegal) {
6974 // Legalize a VGPR Rsrc
6975 //
6976 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
6977 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
6978 // a zero-value SRsrc.
6979 //
6980 // If the instruction is _OFFSET (both idxen and offen disabled), and we
6981 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
6982 // above.
6983 //
6984 // Otherwise we are on non-ADDR64 hardware, and/or we have
6985 // idxen/offen/bothen and we fall back to a waterfall loop.
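// (A "waterfall" loop repeatedly reads the offending operand of the first
// active lane with V_READFIRSTLANE, runs the memory instruction for the lanes
// whose operand matches that value, and masks those lanes off until every
// lane has been serviced; loadMBUFScalarOperandsFromVGPR below builds it.)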
6986
6987 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6988 MachineBasicBlock &MBB = *MI.getParent();
6989
6990 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
6991 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
6992 // This is already an ADDR64 instruction so we need to add the pointer
6993 // extracted from the resource descriptor to the current value of VAddr.
6994 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6995 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6996 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6997
6998 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
6999 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7000 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7001
7002 unsigned RsrcPtr, NewSRsrc;
7003 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7004
7005 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7006 const DebugLoc &DL = MI.getDebugLoc();
7007 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7008 .addDef(CondReg0)
7009 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7010 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7011 .addImm(0);
7012
7013 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7014 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7015 .addDef(CondReg1, RegState::Dead)
7016 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7017 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7018 .addReg(CondReg0, RegState::Kill)
7019 .addImm(0);
7020
7021 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7022 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7023 .addReg(NewVAddrLo)
7024 .addImm(AMDGPU::sub0)
7025 .addReg(NewVAddrHi)
7026 .addImm(AMDGPU::sub1);
7027
7028 VAddr->setReg(NewVAddr);
7029 Rsrc->setReg(NewSRsrc);
7030 } else if (!VAddr && ST.hasAddr64()) {
7031 // This instruction is the _OFFSET variant, so we need to convert it to
7032 // ADDR64.
7034 "FIXME: Need to emit flat atomics here");
7035
7036 unsigned RsrcPtr, NewSRsrc;
7037 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7038
7039 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7040 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7041 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7042 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7043 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7044
7045 // Atomics with return have an additional tied operand and are
7046 // missing some of the special bits.
7047 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7048 MachineInstr *Addr64;
7049
7050 if (!VDataIn) {
7051 // Regular buffer load / store.
7052 MachineInstrBuilder MIB =
7053 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7054 .add(*VData)
7055 .addReg(NewVAddr)
7056 .addReg(NewSRsrc)
7057 .add(*SOffset)
7058 .add(*Offset);
7059
7060 if (const MachineOperand *CPol =
7061 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7062 MIB.addImm(CPol->getImm());
7063 }
7064
7065 if (const MachineOperand *TFE =
7066 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7067 MIB.addImm(TFE->getImm());
7068 }
7069
7070 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7071
7072 MIB.cloneMemRefs(MI);
7073 Addr64 = MIB;
7074 } else {
7075 // Atomics with return.
7076 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7077 .add(*VData)
7078 .add(*VDataIn)
7079 .addReg(NewVAddr)
7080 .addReg(NewSRsrc)
7081 .add(*SOffset)
7082 .add(*Offset)
7083 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7084 .cloneMemRefs(MI);
7085 }
7086
7087 MI.removeFromParent();
7088
7089 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7090 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7091 NewVAddr)
7092 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7093 .addImm(AMDGPU::sub0)
7094 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7095 .addImm(AMDGPU::sub1);
7096 } else {
7097 // Legalize a VGPR Rsrc and soffset together.
7098 if (!isSoffsetLegal) {
7099 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7100 CreatedBB =
7101 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7102 return CreatedBB;
7103 }
7104 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7105 return CreatedBB;
7106 }
7107 }
7108
7109 // Legalize a VGPR soffset.
7110 if (!isSoffsetLegal) {
7111 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7112 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7113 return CreatedBB;
7114 }
7115 return CreatedBB;
7116}
7117
7118 void SIInstrWorklist::insert(MachineInstr *MI) {
7119 InstrList.insert(MI);
7120 // Add MBUF instructions to the deferred list.
7121 int RsrcIdx =
7122 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7123 if (RsrcIdx != -1) {
7124 DeferredList.insert(MI);
7125 }
7126}
7127
7128 bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7129 return DeferredList.contains(MI);
7130}
7131
7132 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7133 MachineDominatorTree *MDT) const {
7134
7135 while (!Worklist.empty()) {
7136 MachineInstr &Inst = *Worklist.top();
7137 Worklist.erase_top();
7138 // Skip MachineInstr in the deferred list.
7139 if (Worklist.isDeferred(&Inst))
7140 continue;
7141 moveToVALUImpl(Worklist, MDT, Inst);
7142 }
7143
7144 // The deferred list of instructions will be processed once
7145 // all the MachineInstrs in the worklist are done.
7146 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7147 moveToVALUImpl(Worklist, MDT, *Inst);
7148 assert(Worklist.empty() &&
7149 "Deferred MachineInstr are not supposed to re-populate worklist");
7150 }
7151}
7152
7153 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7154 MachineDominatorTree *MDT,
7155 MachineInstr &Inst) const {
7156
7157 MachineBasicBlock *MBB = Inst.getParent();
7158 if (!MBB)
7159 return;
7160 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7161 unsigned Opcode = Inst.getOpcode();
7162 unsigned NewOpcode = getVALUOp(Inst);
7163 // Handle some special cases
7164 switch (Opcode) {
7165 default:
7166 break;
7167 case AMDGPU::S_ADD_U64_PSEUDO:
7168 NewOpcode = AMDGPU::V_ADD_U64_PSEUDO;
7169 break;
7170 case AMDGPU::S_SUB_U64_PSEUDO:
7171 NewOpcode = AMDGPU::V_SUB_U64_PSEUDO;
7172 break;
7173 case AMDGPU::S_ADD_I32:
7174 case AMDGPU::S_SUB_I32: {
7175 // FIXME: The u32 versions currently selected use the carry.
7176 bool Changed;
7177 MachineBasicBlock *CreatedBBTmp = nullptr;
7178 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7179 if (Changed)
7180 return;
7181
7182 // Default handling
7183 break;
7184 }
7185
7186 case AMDGPU::S_MUL_U64:
7187 // Split s_mul_u64 into 32-bit vector multiplications.
7188 splitScalarSMulU64(Worklist, Inst, MDT);
7189 Inst.eraseFromParent();
7190 return;
7191
7192 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7193 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7194 // This is a special case of s_mul_u64 where all the operands are either
7195 // zero extended or sign extended.
7196 splitScalarSMulPseudo(Worklist, Inst, MDT);
7197 Inst.eraseFromParent();
7198 return;
7199
7200 case AMDGPU::S_AND_B64:
7201 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7202 Inst.eraseFromParent();
7203 return;
7204
7205 case AMDGPU::S_OR_B64:
7206 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7207 Inst.eraseFromParent();
7208 return;
7209
7210 case AMDGPU::S_XOR_B64:
7211 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7212 Inst.eraseFromParent();
7213 return;
7214
7215 case AMDGPU::S_NAND_B64:
7216 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7217 Inst.eraseFromParent();
7218 return;
7219
7220 case AMDGPU::S_NOR_B64:
7221 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7222 Inst.eraseFromParent();
7223 return;
7224
7225 case AMDGPU::S_XNOR_B64:
7226 if (ST.hasDLInsts())
7227 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7228 else
7229 splitScalar64BitXnor(Worklist, Inst, MDT);
7230 Inst.eraseFromParent();
7231 return;
7232
7233 case AMDGPU::S_ANDN2_B64:
7234 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7235 Inst.eraseFromParent();
7236 return;
7237
7238 case AMDGPU::S_ORN2_B64:
7239 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7240 Inst.eraseFromParent();
7241 return;
7242
7243 case AMDGPU::S_BREV_B64:
7244 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7245 Inst.eraseFromParent();
7246 return;
7247
7248 case AMDGPU::S_NOT_B64:
7249 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7250 Inst.eraseFromParent();
7251 return;
7252
7253 case AMDGPU::S_BCNT1_I32_B64:
7254 splitScalar64BitBCNT(Worklist, Inst);
7255 Inst.eraseFromParent();
7256 return;
7257
7258 case AMDGPU::S_BFE_I64:
7259 splitScalar64BitBFE(Worklist, Inst);
7260 Inst.eraseFromParent();
7261 return;
7262
7263 case AMDGPU::S_FLBIT_I32_B64:
7264 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7265 Inst.eraseFromParent();
7266 return;
7267 case AMDGPU::S_FF1_I32_B64:
7268 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7269 Inst.eraseFromParent();
7270 return;
7271
7272 case AMDGPU::S_LSHL_B32:
7273 if (ST.hasOnlyRevVALUShifts()) {
7274 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7275 swapOperands(Inst);
7276 }
7277 break;
7278 case AMDGPU::S_ASHR_I32:
7279 if (ST.hasOnlyRevVALUShifts()) {
7280 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7281 swapOperands(Inst);
7282 }
7283 break;
7284 case AMDGPU::S_LSHR_B32:
7285 if (ST.hasOnlyRevVALUShifts()) {
7286 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7287 swapOperands(Inst);
7288 }
7289 break;
7290 case AMDGPU::S_LSHL_B64:
7291 if (ST.hasOnlyRevVALUShifts()) {
7292 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7293 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7294 : AMDGPU::V_LSHLREV_B64_e64;
7295 swapOperands(Inst);
7296 }
7297 break;
7298 case AMDGPU::S_ASHR_I64:
7299 if (ST.hasOnlyRevVALUShifts()) {
7300 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7301 swapOperands(Inst);
7302 }
7303 break;
7304 case AMDGPU::S_LSHR_B64:
7305 if (ST.hasOnlyRevVALUShifts()) {
7306 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7307 swapOperands(Inst);
7308 }
7309 break;
7310
7311 case AMDGPU::S_ABS_I32:
7312 lowerScalarAbs(Worklist, Inst);
7313 Inst.eraseFromParent();
7314 return;
7315
7316 case AMDGPU::S_CBRANCH_SCC0:
7317 case AMDGPU::S_CBRANCH_SCC1: {
7318 // Clear unused bits of vcc
7319 Register CondReg = Inst.getOperand(1).getReg();
7320 bool IsSCC = CondReg == AMDGPU::SCC;
7321 Register VCC = RI.getVCC();
7322 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7323 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7324 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7325 .addReg(EXEC)
7326 .addReg(IsSCC ? VCC : CondReg);
7327 Inst.removeOperand(1);
7328 } break;
7329
7330 case AMDGPU::S_BFE_U64:
7331 case AMDGPU::S_BFM_B64:
7332 llvm_unreachable("Moving this op to VALU not implemented");
7333
7334 case AMDGPU::S_PACK_LL_B32_B16:
7335 case AMDGPU::S_PACK_LH_B32_B16:
7336 case AMDGPU::S_PACK_HL_B32_B16:
7337 case AMDGPU::S_PACK_HH_B32_B16:
7338 movePackToVALU(Worklist, MRI, Inst);
7339 Inst.eraseFromParent();
7340 return;
7341
7342 case AMDGPU::S_XNOR_B32:
7343 lowerScalarXnor(Worklist, Inst);
7344 Inst.eraseFromParent();
7345 return;
7346
7347 case AMDGPU::S_NAND_B32:
7348 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7349 Inst.eraseFromParent();
7350 return;
7351
7352 case AMDGPU::S_NOR_B32:
7353 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7354 Inst.eraseFromParent();
7355 return;
7356
7357 case AMDGPU::S_ANDN2_B32:
7358 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7359 Inst.eraseFromParent();
7360 return;
7361
7362 case AMDGPU::S_ORN2_B32:
7363 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7364 Inst.eraseFromParent();
7365 return;
7366
7367 // TODO: remove as soon as everything is ready
7368 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7369 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7370 // can only be selected from the uniform SDNode.
7371 case AMDGPU::S_ADD_CO_PSEUDO:
7372 case AMDGPU::S_SUB_CO_PSEUDO: {
7373 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7374 ? AMDGPU::V_ADDC_U32_e64
7375 : AMDGPU::V_SUBB_U32_e64;
7376 const auto *CarryRC = RI.getWaveMaskRegClass();
7377
7378 Register CarryInReg = Inst.getOperand(4).getReg();
7379 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7380 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7381 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7382 .addReg(CarryInReg);
7383 }
7384
7385 Register CarryOutReg = Inst.getOperand(1).getReg();
7386
7387 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7388 MRI.getRegClass(Inst.getOperand(0).getReg())));
7389 MachineInstr *CarryOp =
7390 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7391 .addReg(CarryOutReg, RegState::Define)
7392 .add(Inst.getOperand(2))
7393 .add(Inst.getOperand(3))
7394 .addReg(CarryInReg)
7395 .addImm(0);
7396 legalizeOperands(*CarryOp);
7397 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7398 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7399 Inst.eraseFromParent();
7400 }
7401 return;
7402 case AMDGPU::S_UADDO_PSEUDO:
7403 case AMDGPU::S_USUBO_PSEUDO: {
7404 const DebugLoc &DL = Inst.getDebugLoc();
7405 MachineOperand &Dest0 = Inst.getOperand(0);
7406 MachineOperand &Dest1 = Inst.getOperand(1);
7407 MachineOperand &Src0 = Inst.getOperand(2);
7408 MachineOperand &Src1 = Inst.getOperand(3);
7409
7410 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7411 ? AMDGPU::V_ADD_CO_U32_e64
7412 : AMDGPU::V_SUB_CO_U32_e64;
7413 const TargetRegisterClass *NewRC =
7414 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7415 Register DestReg = MRI.createVirtualRegister(NewRC);
7416 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7417 .addReg(Dest1.getReg(), RegState::Define)
7418 .add(Src0)
7419 .add(Src1)
7420 .addImm(0); // clamp bit
7421
7422 legalizeOperands(*NewInstr, MDT);
7423 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7424 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7425 Worklist);
7426 Inst.eraseFromParent();
7427 }
7428 return;
7429
7430 case AMDGPU::S_CSELECT_B32:
7431 case AMDGPU::S_CSELECT_B64:
7432 lowerSelect(Worklist, Inst, MDT);
7433 Inst.eraseFromParent();
7434 return;
7435 case AMDGPU::S_CMP_EQ_I32:
7436 case AMDGPU::S_CMP_LG_I32:
7437 case AMDGPU::S_CMP_GT_I32:
7438 case AMDGPU::S_CMP_GE_I32:
7439 case AMDGPU::S_CMP_LT_I32:
7440 case AMDGPU::S_CMP_LE_I32:
7441 case AMDGPU::S_CMP_EQ_U32:
7442 case AMDGPU::S_CMP_LG_U32:
7443 case AMDGPU::S_CMP_GT_U32:
7444 case AMDGPU::S_CMP_GE_U32:
7445 case AMDGPU::S_CMP_LT_U32:
7446 case AMDGPU::S_CMP_LE_U32:
7447 case AMDGPU::S_CMP_EQ_U64:
7448 case AMDGPU::S_CMP_LG_U64:
7449 case AMDGPU::S_CMP_LT_F32:
7450 case AMDGPU::S_CMP_EQ_F32:
7451 case AMDGPU::S_CMP_LE_F32:
7452 case AMDGPU::S_CMP_GT_F32:
7453 case AMDGPU::S_CMP_LG_F32:
7454 case AMDGPU::S_CMP_GE_F32:
7455 case AMDGPU::S_CMP_O_F32:
7456 case AMDGPU::S_CMP_U_F32:
7457 case AMDGPU::S_CMP_NGE_F32:
7458 case AMDGPU::S_CMP_NLG_F32:
7459 case AMDGPU::S_CMP_NGT_F32:
7460 case AMDGPU::S_CMP_NLE_F32:
7461 case AMDGPU::S_CMP_NEQ_F32:
7462 case AMDGPU::S_CMP_NLT_F32: {
7463 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7464 auto NewInstr =
7465 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7466 .setMIFlags(Inst.getFlags());
7467 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7468 0) {
7469 NewInstr
7470 .addImm(0) // src0_modifiers
7471 .add(Inst.getOperand(0)) // src0
7472 .addImm(0) // src1_modifiers
7473 .add(Inst.getOperand(1)) // src1
7474 .addImm(0); // clamp
7475 } else {
7476 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7477 }
7478 legalizeOperands(*NewInstr, MDT);
7479 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7480 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7481 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7482 Inst.eraseFromParent();
7483 return;
7484 }
7485 case AMDGPU::S_CMP_LT_F16:
7486 case AMDGPU::S_CMP_EQ_F16:
7487 case AMDGPU::S_CMP_LE_F16:
7488 case AMDGPU::S_CMP_GT_F16:
7489 case AMDGPU::S_CMP_LG_F16:
7490 case AMDGPU::S_CMP_GE_F16:
7491 case AMDGPU::S_CMP_O_F16:
7492 case AMDGPU::S_CMP_U_F16:
7493 case AMDGPU::S_CMP_NGE_F16:
7494 case AMDGPU::S_CMP_NLG_F16:
7495 case AMDGPU::S_CMP_NGT_F16:
7496 case AMDGPU::S_CMP_NLE_F16:
7497 case AMDGPU::S_CMP_NEQ_F16:
7498 case AMDGPU::S_CMP_NLT_F16: {
7499 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7500 auto NewInstr =
7501 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7502 .setMIFlags(Inst.getFlags());
7503 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
7504 NewInstr
7505 .addImm(0) // src0_modifiers
7506 .add(Inst.getOperand(0)) // src0
7507 .addImm(0) // src1_modifiers
7508 .add(Inst.getOperand(1)) // src1
7509 .addImm(0); // clamp
7510 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7511 NewInstr.addImm(0); // op_sel0
7512 } else {
7513 NewInstr
7514 .add(Inst.getOperand(0))
7515 .add(Inst.getOperand(1));
7516 }
7517 legalizeOperands(*NewInstr, MDT);
7518 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7519 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7520 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7521 Inst.eraseFromParent();
7522 return;
7523 }
7524 case AMDGPU::S_CVT_HI_F32_F16: {
7525 const DebugLoc &DL = Inst.getDebugLoc();
7526 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7527 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7528 if (ST.useRealTrue16Insts()) {
7529 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
7530 .add(Inst.getOperand(1));
7531 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7532 .addImm(0) // src0_modifiers
7533 .addReg(TmpReg, 0, AMDGPU::hi16)
7534 .addImm(0) // clamp
7535 .addImm(0) // omod
7536 .addImm(0); // op_sel0
7537 } else {
7538 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7539 .addImm(16)
7540 .add(Inst.getOperand(1));
7541 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7542 .addImm(0) // src0_modifiers
7543 .addReg(TmpReg)
7544 .addImm(0) // clamp
7545 .addImm(0); // omod
7546 }
7547
7548 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7549 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7550 Inst.eraseFromParent();
7551 return;
7552 }
7553 case AMDGPU::S_MINIMUM_F32:
7554 case AMDGPU::S_MAXIMUM_F32: {
7555 const DebugLoc &DL = Inst.getDebugLoc();
7556 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7557 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7558 .addImm(0) // src0_modifiers
7559 .add(Inst.getOperand(1))
7560 .addImm(0) // src1_modifiers
7561 .add(Inst.getOperand(2))
7562 .addImm(0) // clamp
7563 .addImm(0); // omod
7564 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7565
7566 legalizeOperands(*NewInstr, MDT);
7567 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7568 Inst.eraseFromParent();
7569 return;
7570 }
7571 case AMDGPU::S_MINIMUM_F16:
7572 case AMDGPU::S_MAXIMUM_F16: {
7573 const DebugLoc &DL = Inst.getDebugLoc();
7574 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
7575 ? &AMDGPU::VGPR_16RegClass
7576 : &AMDGPU::VGPR_32RegClass);
7577 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7578 .addImm(0) // src0_modifiers
7579 .add(Inst.getOperand(1))
7580 .addImm(0) // src1_modifiers
7581 .add(Inst.getOperand(2))
7582 .addImm(0) // clamp
7583 .addImm(0) // omod
7584 .addImm(0); // opsel0
7585 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7586 legalizeOperands(*NewInstr, MDT);
7587 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7588 Inst.eraseFromParent();
7589 return;
7590 }
7591 }
7592
7593 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
7594 // We cannot move this instruction to the VALU, so we should try to
7595 // legalize its operands instead.
7596 legalizeOperands(Inst, MDT);
7597 return;
7598 }
7599 // Handle converting generic instructions like COPY-to-SGPR into
7600 // COPY-to-VGPR.
7601 if (NewOpcode == Opcode) {
7602 Register DstReg = Inst.getOperand(0).getReg();
7603 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7604
7605 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7606 // hope for the best.
7607 if (Inst.isCopy() && DstReg.isPhysical() &&
7608 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7609 // TODO: Only works for 32 bit registers.
7610 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7611 get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7612 .add(Inst.getOperand(1));
7613 Inst.eraseFromParent();
7614 return;
7615 }
7616
7617 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
7618 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
7619 // Instead of creating a copy where src and dst are the same register
7620 // class, we just replace all uses of dst with src. These kinds of
7621 // copies interfere with the heuristics MachineSink uses to decide
7622 // whether or not to split a critical edge, since the pass assumes
7623 // that copies will end up as machine instructions and not be
7624 // eliminated.
7625 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
7626 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
7627 MRI.clearKillFlags(Inst.getOperand(1).getReg());
7628 Inst.getOperand(0).setReg(DstReg);
7629 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7630 // these are deleted later, but at -O0 it would leave a suspicious
7631 // looking illegal copy of an undef register.
7632 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7633 Inst.removeOperand(I);
7634 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
7635 return;
7636 }
7637 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7638 MRI.replaceRegWith(DstReg, NewDstReg);
7639 legalizeOperands(Inst, MDT);
7640 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7641 return;
7642 }
7643
7644 // Use the new VALU Opcode.
7645 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
7646 .setMIFlags(Inst.getFlags());
7647 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
7648 // Intersperse VOP3 modifiers among the SALU operands.
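// (Each modifier operand the VOP3 form defines, i.e. src*_modifiers, clamp,
// omod and op_sel, is filled with a zero immediate, interleaved with the
// original SALU sources: sources (a, b) become 0, a, 0, b followed by the
// trailing zero modifiers that the new opcode actually has.)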
7649 NewInstr->addOperand(Inst.getOperand(0));
7650 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7651 AMDGPU::OpName::src0_modifiers) >= 0)
7652 NewInstr.addImm(0);
7653 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7654 MachineOperand Src = Inst.getOperand(1);
7655 if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7656 Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7657 NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7658 else
7659 NewInstr->addOperand(Src);
7660 }
7661
7662 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
7663 // We are converting these to a BFE, so we need to add the missing
7664 // operands for the size and offset.
7665 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
7666 NewInstr.addImm(0);
7667 NewInstr.addImm(Size);
7668 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
7669 // The VALU version adds the second operand to the result, so insert an
7670 // extra 0 operand.
7671 NewInstr.addImm(0);
7672 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
7673 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
7674 // If we need to move this to VGPRs, we need to unpack the second
7675 // operand back into the 2 separate ones for bit offset and width.
7676 assert(OffsetWidthOp.isImm() &&
7677 "Scalar BFE is only implemented for constant width and offset");
7678 uint32_t Imm = OffsetWidthOp.getImm();
7679
7680 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
7681 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
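// For example, an immediate of 0x00100004 has bits [5:0] = 4 and bits
// [22:16] = 16, so it unpacks to offset 4 and width 16.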
7682 NewInstr.addImm(Offset);
7683 NewInstr.addImm(BitWidth);
7684 } else {
7685 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7686 AMDGPU::OpName::src1_modifiers) >= 0)
7687 NewInstr.addImm(0);
7688 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
7689 NewInstr->addOperand(Inst.getOperand(2));
7690 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7691 AMDGPU::OpName::src2_modifiers) >= 0)
7692 NewInstr.addImm(0);
7693 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
7694 NewInstr->addOperand(Inst.getOperand(3));
7695 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
7696 NewInstr.addImm(0);
7697 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
7698 NewInstr.addImm(0);
7699 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
7700 NewInstr.addImm(0);
7701 }
7702 } else {
7703 // Just copy the SALU operands.
7704 for (const MachineOperand &Op : Inst.explicit_operands())
7705 NewInstr->addOperand(Op);
7706 }
7707
7708 // Remove any references to SCC. Vector instructions can't read from it, and
7709 // we're just about to add the implicit use / defs of VCC; we don't want
7710 // both.
7711 for (MachineOperand &Op : Inst.implicit_operands()) {
7712 if (Op.getReg() == AMDGPU::SCC) {
7713 // Only propagate through live-def of SCC.
7714 if (Op.isDef() && !Op.isDead())
7715 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
7716 if (Op.isUse())
7717 addSCCDefsToVALUWorklist(NewInstr, Worklist);
7718 }
7719 }
7720 Inst.eraseFromParent();
7721 Register NewDstReg;
7722 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
7723 Register DstReg = NewInstr->getOperand(0).getReg();
7724 assert(DstReg.isVirtual());
7725 // Update the destination register class.
7726 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
7727 assert(NewDstRC);
7728 NewDstReg = MRI.createVirtualRegister(NewDstRC);
7729 MRI.replaceRegWith(DstReg, NewDstReg);
7730 }
7731 fixImplicitOperands(*NewInstr);
7732 // Legalize the operands
7733 legalizeOperands(*NewInstr, MDT);
7734 if (NewDstReg)
7735 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7736}
7737
7738// Add/sub require special handling to deal with carry outs.
7739std::pair<bool, MachineBasicBlock *>
7740SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
7741 MachineDominatorTree *MDT) const {
7742 if (ST.hasAddNoCarry()) {
7743 // Assume there is no user of scc since we don't select this in that case.
7744 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
7745 // is used.
7746
7747 MachineBasicBlock &MBB = *Inst.getParent();
7748 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7749
7750 Register OldDstReg = Inst.getOperand(0).getReg();
7751 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7752
7753 unsigned Opc = Inst.getOpcode();
7754 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
7755
7756 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
7757 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
7758
7759 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
7760 Inst.removeOperand(3);
7761
7762 Inst.setDesc(get(NewOpc));
7763 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
7765 MRI.replaceRegWith(OldDstReg, ResultReg);
7766 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
7767
7768 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7769 return std::pair(true, NewBB);
7770 }
7771
7772 return std::pair(false, nullptr);
7773}
7774
7775void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
7776 MachineDominatorTree *MDT) const {
7777
7778 MachineBasicBlock &MBB = *Inst.getParent();
7779 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7780 MachineBasicBlock::iterator MII = Inst;
7781 DebugLoc DL = Inst.getDebugLoc();
7782
7783 MachineOperand &Dest = Inst.getOperand(0);
7784 MachineOperand &Src0 = Inst.getOperand(1);
7785 MachineOperand &Src1 = Inst.getOperand(2);
7786 MachineOperand &Cond = Inst.getOperand(3);
7787
7788 Register CondReg = Cond.getReg();
7789 bool IsSCC = (CondReg == AMDGPU::SCC);
7790
7791 // If this is a trivial select where the condition is effectively not SCC
7792 // (CondReg is a source of copy to SCC), then the select is semantically
7793 // equivalent to copying CondReg. Hence, there is no need to create a
7794 // V_CNDMASK; we can just reuse CondReg and bail out.
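// (With Src0 == -1 and Src1 == 0, the select yields all-ones where the
// condition mask is set and zero elsewhere, which is exactly the lane mask
// already held in CondReg.)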
7795 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
7796 (Src1.getImm() == 0)) {
7797 MRI.replaceRegWith(Dest.getReg(), CondReg);
7798 return;
7799 }
7800
7801 Register NewCondReg = CondReg;
7802 if (IsSCC) {
7803 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
7804 NewCondReg = MRI.createVirtualRegister(TC);
7805
7806 // Now look for the closest SCC def if it is a copy
7807 // replacing the CondReg with the COPY source register
7808 bool CopyFound = false;
7809 for (MachineInstr &CandI :
7810 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
7811 Inst.getParent()->rend())) {
7812 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
7813 -1) {
7814 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
7815 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
7816 .addReg(CandI.getOperand(1).getReg());
7817 CopyFound = true;
7818 }
7819 break;
7820 }
7821 }
7822 if (!CopyFound) {
7823 // SCC def is not a copy
7824 // Insert a trivial select instead of creating a copy, because a copy from
7825 // SCC would semantically mean just copying a single bit, but we may need
7826 // the result to be a vector condition mask that needs preserving.
7827 unsigned Opcode =
7828 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
7829 auto NewSelect =
7830 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
7831 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
7832 }
7833 }
7834
7835 Register NewDestReg = MRI.createVirtualRegister(
7836 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
7837 MachineInstr *NewInst;
7838 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
7839 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
7840 .addImm(0)
7841 .add(Src1) // False
7842 .addImm(0)
7843 .add(Src0) // True
7844 .addReg(NewCondReg);
7845 } else {
7846 NewInst =
7847 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
7848 .add(Src1) // False
7849 .add(Src0) // True
7850 .addReg(NewCondReg);
7851 }
7852 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
7853 legalizeOperands(*NewInst, MDT);
7854 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
7855}
7856
7857void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
7858 MachineInstr &Inst) const {
7859 MachineBasicBlock &MBB = *Inst.getParent();
7860 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7861 MachineBasicBlock::iterator MII = Inst;
7862 DebugLoc DL = Inst.getDebugLoc();
7863
7864 MachineOperand &Dest = Inst.getOperand(0);
7865 MachineOperand &Src = Inst.getOperand(1);
7866 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7867 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7868
7869 unsigned SubOp = ST.hasAddNoCarry() ?
7870 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
7871
7872 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
7873 .addImm(0)
7874 .addReg(Src.getReg());
7875
7876 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
7877 .addReg(Src.getReg())
7878 .addReg(TmpReg);
7879
7880 MRI.replaceRegWith(Dest.getReg(), ResultReg);
7881 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7882}
7883
7884void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
7885 MachineInstr &Inst) const {
7886 MachineBasicBlock &MBB = *Inst.getParent();
7887 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7888 MachineBasicBlock::iterator MII = Inst;
7889 const DebugLoc &DL = Inst.getDebugLoc();
7890
7891 MachineOperand &Dest = Inst.getOperand(0);
7892 MachineOperand &Src0 = Inst.getOperand(1);
7893 MachineOperand &Src1 = Inst.getOperand(2);
7894
7895 if (ST.hasDLInsts()) {
7896 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7897 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
7898 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
7899
7900 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
7901 .add(Src0)
7902 .add(Src1);
7903
7904 MRI.replaceRegWith(Dest.getReg(), NewDest);
7905 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7906 } else {
7907 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
7908 // invert either source and then perform the XOR. If either source is a
7909 // scalar register, then we can leave the inversion on the scalar unit to
7910 // achieve a better distribution of scalar and vector instructions.
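// For example, with 4-bit values x = 0b1100 and y = 0b1010:
// !(x ^ y) = !0b0110 = 0b1001, and (!x) ^ y = 0b0011 ^ 0b1010 = 0b1001.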
7911 bool Src0IsSGPR = Src0.isReg() &&
7912 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
7913 bool Src1IsSGPR = Src1.isReg() &&
7914 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
7915 MachineInstr *Xor;
7916 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7917 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7918
7919 // Build a pair of scalar instructions and add them to the work list.
7920 // The next iteration over the work list will lower these to the vector
7921 // unit as necessary.
7922 if (Src0IsSGPR) {
7923 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
7924 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7925 .addReg(Temp)
7926 .add(Src1);
7927 } else if (Src1IsSGPR) {
7928 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
7929 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7930 .add(Src0)
7931 .addReg(Temp);
7932 } else {
7933 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
7934 .add(Src0)
7935 .add(Src1);
7936 MachineInstr *Not =
7937 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
7938 Worklist.insert(Not);
7939 }
7940
7941 MRI.replaceRegWith(Dest.getReg(), NewDest);
7942
7943 Worklist.insert(Xor);
7944
7945 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7946 }
7947}
7948
7949void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
7950 MachineInstr &Inst,
7951 unsigned Opcode) const {
7952 MachineBasicBlock &MBB = *Inst.getParent();
7953 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7954 MachineBasicBlock::iterator MII = Inst;
7955 const DebugLoc &DL = Inst.getDebugLoc();
7956
7957 MachineOperand &Dest = Inst.getOperand(0);
7958 MachineOperand &Src0 = Inst.getOperand(1);
7959 MachineOperand &Src1 = Inst.getOperand(2);
7960
7961 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7962 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7963
7964 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
7965 .add(Src0)
7966 .add(Src1);
7967
7968 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
7969 .addReg(Interm);
7970
7971 Worklist.insert(&Op);
7972 Worklist.insert(&Not);
7973
7974 MRI.replaceRegWith(Dest.getReg(), NewDest);
7975 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7976}
7977
7978void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
7979 MachineInstr &Inst,
7980 unsigned Opcode) const {
7981 MachineBasicBlock &MBB = *Inst.getParent();
7982 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7983 MachineBasicBlock::iterator MII = Inst;
7984 const DebugLoc &DL = Inst.getDebugLoc();
7985
7986 MachineOperand &Dest = Inst.getOperand(0);
7987 MachineOperand &Src0 = Inst.getOperand(1);
7988 MachineOperand &Src1 = Inst.getOperand(2);
7989
7990 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7991 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7992
7993 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
7994 .add(Src1);
7995
7996 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
7997 .add(Src0)
7998 .addReg(Interm);
7999
8000 Worklist.insert(&Not);
8001 Worklist.insert(&Op);
8002
8003 MRI.replaceRegWith(Dest.getReg(), NewDest);
8004 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8005}
8006
8007void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8008 MachineInstr &Inst, unsigned Opcode,
8009 bool Swap) const {
8010 MachineBasicBlock &MBB = *Inst.getParent();
8011 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8012
8013 MachineOperand &Dest = Inst.getOperand(0);
8014 MachineOperand &Src0 = Inst.getOperand(1);
8015 DebugLoc DL = Inst.getDebugLoc();
8016
8017 MachineBasicBlock::iterator MII = Inst;
8018
8019 const MCInstrDesc &InstDesc = get(Opcode);
8020 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8021 MRI.getRegClass(Src0.getReg()) :
8022 &AMDGPU::SGPR_32RegClass;
8023
8024 const TargetRegisterClass *Src0SubRC =
8025 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8026
8027 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8028 AMDGPU::sub0, Src0SubRC);
8029
8030 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8031 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8032 const TargetRegisterClass *NewDestSubRC =
8033 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8034
8035 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8036 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8037
8038 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8039 AMDGPU::sub1, Src0SubRC);
8040
8041 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8042 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8043
8044 if (Swap)
8045 std::swap(DestSub0, DestSub1);
8046
8047 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8048 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8049 .addReg(DestSub0)
8050 .addImm(AMDGPU::sub0)
8051 .addReg(DestSub1)
8052 .addImm(AMDGPU::sub1);
8053
8054 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8055
8056 Worklist.insert(&LoHalf);
8057 Worklist.insert(&HiHalf);
8058
8059 // We don't need to legalizeOperands here because for a single operand, src0
8060 // will support any kind of input.
8061
8062 // Move all users of this moved value.
8063 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8064}
8065
8066 // There is no vector equivalent of s_mul_u64, so we need to split the
8067 // s_mul_u64 into 32-bit vector multiplications.
8068void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8069 MachineInstr &Inst,
8070 MachineDominatorTree *MDT) const {
8071 MachineBasicBlock &MBB = *Inst.getParent();
8072 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8073
8074 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8075 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8076 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8077
8078 MachineOperand &Dest = Inst.getOperand(0);
8079 MachineOperand &Src0 = Inst.getOperand(1);
8080 MachineOperand &Src1 = Inst.getOperand(2);
8081 const DebugLoc &DL = Inst.getDebugLoc();
8082 MachineBasicBlock::iterator MII = Inst;
8083
8084 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8085 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8086 const TargetRegisterClass *Src0SubRC =
8087 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8088 if (RI.isSGPRClass(Src0SubRC))
8089 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8090 const TargetRegisterClass *Src1SubRC =
8091 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8092 if (RI.isSGPRClass(Src1SubRC))
8093 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8094
8095 // First, we extract the low 32-bit and high 32-bit values from each of the
8096 // operands.
8097 MachineOperand Op0L =
8098 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8099 MachineOperand Op1L =
8100 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8101 MachineOperand Op0H =
8102 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8103 MachineOperand Op1H =
8104 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8105
8106 // The multiplication is done as follows:
8107 //
8108 // Op1H Op1L
8109 // * Op0H Op0L
8110 // --------------------
8111 // Op1H*Op0L Op1L*Op0L
8112 // + Op1H*Op0H Op1L*Op0H
8113 // -----------------------------------------
8114 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8115 //
8116 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8117 // value and that would overflow.
8118 // The low 32-bit value is Op1L*Op0L.
8119 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
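// For example, 0x1'00000003 * 0x1'00000005: the low half is Op1L*Op0L =
// 5*3 = 15, and the high half is Op1H*Op0L + Op1L*Op0H + carry = 3 + 5 + 0
// = 8, giving the truncated 64-bit result 0x00000008'0000000F (the
// Op1H*Op0H term only affects bits above 63).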
8120
8121 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8122 MachineInstr *Op1L_Op0H =
8123 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8124 .add(Op1L)
8125 .add(Op0H);
8126
8127 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8128 MachineInstr *Op1H_Op0L =
8129 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8130 .add(Op1H)
8131 .add(Op0L);
8132
8133 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8134 MachineInstr *Carry =
8135 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8136 .add(Op1L)
8137 .add(Op0L);
8138
8139 MachineInstr *LoHalf =
8140 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8141 .add(Op1L)
8142 .add(Op0L);
8143
8144 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8145 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8146 .addReg(Op1L_Op0H_Reg)
8147 .addReg(Op1H_Op0L_Reg);
8148
8149 MachineInstr *HiHalf =
8150 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8151 .addReg(AddReg)
8152 .addReg(CarryReg);
8153
8154 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8155 .addReg(DestSub0)
8156 .addImm(AMDGPU::sub0)
8157 .addReg(DestSub1)
8158 .addImm(AMDGPU::sub1);
8159
8160 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8161
8162 // Try to legalize the operands in case we need to swap the order to keep it
8163 // valid.
8164 legalizeOperands(*Op1L_Op0H, MDT);
8165 legalizeOperands(*Op1H_Op0L, MDT);
8166 legalizeOperands(*Carry, MDT);
8167 legalizeOperands(*LoHalf, MDT);
8168 legalizeOperands(*Add, MDT);
8169 legalizeOperands(*HiHalf, MDT);
8170
8171 // Move all users of this moved value.
8172 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8173}
8174
8175 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8176// multiplications.
8177void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8178 MachineInstr &Inst,
8179 MachineDominatorTree *MDT) const {
8180 MachineBasicBlock &MBB = *Inst.getParent();
8181 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8182
8183 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8184 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8185 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8186
8187 MachineOperand &Dest = Inst.getOperand(0);
8188 MachineOperand &Src0 = Inst.getOperand(1);
8189 MachineOperand &Src1 = Inst.getOperand(2);
8190 const DebugLoc &DL = Inst.getDebugLoc();
8191 MachineBasicBlock::iterator MII = Inst;
8192
8193 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8194 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8195 const TargetRegisterClass *Src0SubRC =
8196 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8197 if (RI.isSGPRClass(Src0SubRC))
8198 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8199 const TargetRegisterClass *Src1SubRC =
8200 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8201 if (RI.isSGPRClass(Src1SubRC))
8202 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8203
8204 // First, we extract the low 32-bit and high 32-bit values from each of the
8205 // operands.
8206 MachineOperand Op0L =
8207 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8208 MachineOperand Op1L =
8209 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8210
8211 unsigned Opc = Inst.getOpcode();
8212 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8213 ? AMDGPU::V_MUL_HI_U32_e64
8214 : AMDGPU::V_MUL_HI_I32_e64;
8215 MachineInstr *HiHalf =
8216 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8217
8218 MachineInstr *LoHalf =
8219 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8220 .add(Op1L)
8221 .add(Op0L);
8222
8223 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8224 .addReg(DestSub0)
8225 .addImm(AMDGPU::sub0)
8226 .addReg(DestSub1)
8227 .addImm(AMDGPU::sub1);
8228
8229 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8230
8231 // Try to legalize the operands in case we need to swap the order to keep it
8232 // valid.
8233 legalizeOperands(*HiHalf, MDT);
8234 legalizeOperands(*LoHalf, MDT);
8235
8236 // Move all users of this moved value.
8237 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8238}
8239
8240void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8241 MachineInstr &Inst, unsigned Opcode,
8242 MachineDominatorTree *MDT) const {
8243 MachineBasicBlock &MBB = *Inst.getParent();
8244 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8245
8246 MachineOperand &Dest = Inst.getOperand(0);
8247 MachineOperand &Src0 = Inst.getOperand(1);
8248 MachineOperand &Src1 = Inst.getOperand(2);
8249 DebugLoc DL = Inst.getDebugLoc();
8250
8251 MachineBasicBlock::iterator MII = Inst;
8252
8253 const MCInstrDesc &InstDesc = get(Opcode);
8254 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8255 MRI.getRegClass(Src0.getReg()) :
8256 &AMDGPU::SGPR_32RegClass;
8257
8258 const TargetRegisterClass *Src0SubRC =
8259 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8260 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8261 MRI.getRegClass(Src1.getReg()) :
8262 &AMDGPU::SGPR_32RegClass;
8263
8264 const TargetRegisterClass *Src1SubRC =
8265 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8266
8267 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8268 AMDGPU::sub0, Src0SubRC);
8269 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8270 AMDGPU::sub0, Src1SubRC);
8271 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8272 AMDGPU::sub1, Src0SubRC);
8273 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8274 AMDGPU::sub1, Src1SubRC);
8275
8276 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8277 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8278 const TargetRegisterClass *NewDestSubRC =
8279 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8280
8281 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8282 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8283 .add(SrcReg0Sub0)
8284 .add(SrcReg1Sub0);
8285
8286 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8287 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8288 .add(SrcReg0Sub1)
8289 .add(SrcReg1Sub1);
8290
8291 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8292 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8293 .addReg(DestSub0)
8294 .addImm(AMDGPU::sub0)
8295 .addReg(DestSub1)
8296 .addImm(AMDGPU::sub1);
8297
8298 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8299
8300 Worklist.insert(&LoHalf);
8301 Worklist.insert(&HiHalf);
8302
8303 // Move all users of this moved value.
8304 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8305}
8306
8307void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8308 MachineInstr &Inst,
8309 MachineDominatorTree *MDT) const {
8310 MachineBasicBlock &MBB = *Inst.getParent();
8311 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8312
8313 MachineOperand &Dest = Inst.getOperand(0);
8314 MachineOperand &Src0 = Inst.getOperand(1);
8315 MachineOperand &Src1 = Inst.getOperand(2);
8316 const DebugLoc &DL = Inst.getDebugLoc();
8317
8318 MachineBasicBlock::iterator MII = Inst;
8319
8320 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8321
8322 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8323
8324 MachineOperand* Op0;
8325 MachineOperand* Op1;
8326
8327 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8328 Op0 = &Src0;
8329 Op1 = &Src1;
8330 } else {
8331 Op0 = &Src1;
8332 Op1 = &Src0;
8333 }
8334
8335 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8336 .add(*Op0);
8337
8338 Register NewDest = MRI.createVirtualRegister(DestRC);
8339
8340 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8341 .addReg(Interm)
8342 .add(*Op1);
8343
8344 MRI.replaceRegWith(Dest.getReg(), NewDest);
8345
8346 Worklist.insert(&Xor);
8347}
8348
8349void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8350 MachineInstr &Inst) const {
8351 MachineBasicBlock &MBB = *Inst.getParent();
8352 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8353
8354 MachineBasicBlock::iterator MII = Inst;
8355 const DebugLoc &DL = Inst.getDebugLoc();
8356
8357 MachineOperand &Dest = Inst.getOperand(0);
8358 MachineOperand &Src = Inst.getOperand(1);
8359
8360 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8361 const TargetRegisterClass *SrcRC = Src.isReg() ?
8362 MRI.getRegClass(Src.getReg()) :
8363 &AMDGPU::SGPR_32RegClass;
8364
8365 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8366 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8367
8368 const TargetRegisterClass *SrcSubRC =
8369 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8370
8371 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8372 AMDGPU::sub0, SrcSubRC);
8373 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8374 AMDGPU::sub1, SrcSubRC);
8375
8376 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8377
8378 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8379
8380 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8381
8382 // We don't need to legalize operands here. src0 for either instruction can be
8383 // an SGPR, and the second input is unused or determined here.
8384 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8385}
8386
8387void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8388 MachineInstr &Inst) const {
8389 MachineBasicBlock &MBB = *Inst.getParent();
8390 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8391 MachineBasicBlock::iterator MII = Inst;
8392 const DebugLoc &DL = Inst.getDebugLoc();
8393
8394 MachineOperand &Dest = Inst.getOperand(0);
8395 uint32_t Imm = Inst.getOperand(2).getImm();
8396 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8397 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8398
8399 (void) Offset;
8400
8401 // Only sext_inreg cases handled.
8402 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8403 Offset == 0 && "Not implemented");
8404
8405 if (BitWidth < 32) {
8406 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8407 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8408 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8409
8410 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8411 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8412 .addImm(0)
8413 .addImm(BitWidth);
8414
8415 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8416 .addImm(31)
8417 .addReg(MidRegLo);
8418
8419 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8420 .addReg(MidRegLo)
8421 .addImm(AMDGPU::sub0)
8422 .addReg(MidRegHi)
8423 .addImm(AMDGPU::sub1);
8424
8425 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8426 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8427 return;
8428 }
8429
8430 MachineOperand &Src = Inst.getOperand(1);
8431 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8432 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8433
8434 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8435 .addImm(31)
8436 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8437
8438 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8439 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8440 .addImm(AMDGPU::sub0)
8441 .addReg(TmpReg)
8442 .addImm(AMDGPU::sub1);
8443
8444 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8445 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8446}
8447
8448void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8449 MachineInstr &Inst, unsigned Opcode,
8450 MachineDominatorTree *MDT) const {
8451 // (S_FLBIT_I32_B64 hi:lo) ->
8452 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8453 // (S_FF1_I32_B64 hi:lo) ->
8454 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
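// For example, for S_FF1_I32_B64 (cttz) with lo == 0: V_FFBL_B32 of lo
// returns all ones ("no bit found"), so the clamped add on the hi half,
// V_FFBL_B32(hi) + 32, wins the unsigned min and reports the first set bit
// counted across the full 64-bit value.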
8455
8456 MachineBasicBlock &MBB = *Inst.getParent();
8457 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8458 MachineBasicBlock::iterator MII = Inst;
8459 const DebugLoc &DL = Inst.getDebugLoc();
8460
8461 MachineOperand &Dest = Inst.getOperand(0);
8462 MachineOperand &Src = Inst.getOperand(1);
8463
8464 const MCInstrDesc &InstDesc = get(Opcode);
8465
8466 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8467 unsigned OpcodeAdd =
8468 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8469
8470 const TargetRegisterClass *SrcRC =
8471 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8472 const TargetRegisterClass *SrcSubRC =
8473 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8474
8475 MachineOperand SrcRegSub0 =
8476 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8477 MachineOperand SrcRegSub1 =
8478 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8479
8480 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8481 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8482 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8483 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8484
8485 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8486
8487 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8488
8489 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8490 .addReg(IsCtlz ? MidReg1 : MidReg2)
8491 .addImm(32)
8492 .addImm(1); // enable clamp
8493
8494 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8495 .addReg(MidReg3)
8496 .addReg(IsCtlz ? MidReg2 : MidReg1);
8497
8498 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8499
8500 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8501}
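
// Illustrative, self-contained sketch (not part of the LLVM source): the
// count-op expansion above builds a 64-bit ctlz/cttz out of two 32-bit counts,
// a clamped (saturating) add of 32, and an unsigned min. The ctlz flavour is
// modelled below in plain C++; all names are hypothetical.
static uint32_t ffbh32(uint32_t X) {   // V_FFBH_U32: clz, -1 for a zero input
  if (X == 0)
    return 0xffffffffu;
  uint32_t N = 0;
  while (!(X & 0x80000000u)) {
    X <<= 1;
    ++N;
  }
  return N;
}
static uint32_t uaddsat32(uint32_t A, uint32_t B) {   // V_ADD_U32 with clamp
  uint64_t S = uint64_t(A) + uint64_t(B);
  return S > 0xffffffffu ? 0xffffffffu : uint32_t(S);
}
static uint32_t flbit64(uint32_t Hi, uint32_t Lo) {   // S_FLBIT_I32_B64 hi:lo
  uint32_t A = ffbh32(Hi), B = uaddsat32(ffbh32(Lo), 32);
  return A < B ? A : B;                               // V_MIN_U32
}
// E.g. flbit64(0, 1) == 63, flbit64(1, 0) == 31, flbit64(0, 0) == 0xffffffff.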
8502
8503void SIInstrInfo::addUsersToMoveToVALUWorklist(
8505 SIInstrWorklist &Worklist) const {
8506 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
8507 E = MRI.use_end(); I != E;) {
8508 MachineInstr &UseMI = *I->getParent();
8509
8510 unsigned OpNo = 0;
8511
8512 switch (UseMI.getOpcode()) {
8513 case AMDGPU::COPY:
8514 case AMDGPU::WQM:
8515 case AMDGPU::SOFT_WQM:
8516 case AMDGPU::STRICT_WWM:
8517 case AMDGPU::STRICT_WQM:
8518 case AMDGPU::REG_SEQUENCE:
8519 case AMDGPU::PHI:
8520 case AMDGPU::INSERT_SUBREG:
8521 break;
8522 default:
8523 OpNo = I.getOperandNo();
8524 break;
8525 }
8526
8527 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
8528 Worklist.insert(&UseMI);
8529
8530 do {
8531 ++I;
8532 } while (I != E && I->getParent() == &UseMI);
8533 } else {
8534 ++I;
8535 }
8536 }
8537}
8538
8539void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
8541 MachineInstr &Inst) const {
8542 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8544 MachineOperand &Src0 = Inst.getOperand(1);
8545 MachineOperand &Src1 = Inst.getOperand(2);
8546 const DebugLoc &DL = Inst.getDebugLoc();
8547
8548 switch (Inst.getOpcode()) {
8549 case AMDGPU::S_PACK_LL_B32_B16: {
8550 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8551 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8552
8553 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8554 // 0.
8555 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8556 .addImm(0xffff);
8557
8558 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
8559 .addReg(ImmReg, RegState::Kill)
8560 .add(Src0);
8561
8562 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8563 .add(Src1)
8564 .addImm(16)
8565 .addReg(TmpReg, RegState::Kill);
8566 break;
8567 }
8568 case AMDGPU::S_PACK_LH_B32_B16: {
8569 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8570 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8571 .addImm(0xffff);
8572 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
8573 .addReg(ImmReg, RegState::Kill)
8574 .add(Src0)
8575 .add(Src1);
8576 break;
8577 }
8578 case AMDGPU::S_PACK_HL_B32_B16: {
8579 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8580 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8581 .addImm(16)
8582 .add(Src0);
8583 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8584 .add(Src1)
8585 .addImm(16)
8586 .addReg(TmpReg, RegState::Kill);
8587 break;
8588 }
8589 case AMDGPU::S_PACK_HH_B32_B16: {
8590 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8591 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8592 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8593 .addImm(16)
8594 .add(Src0);
8595 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8596 .addImm(0xffff0000);
8597 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
8598 .add(Src1)
8599 .addReg(ImmReg, RegState::Kill)
8600 .addReg(TmpReg, RegState::Kill);
8601 break;
8602 }
8603 default:
8604 llvm_unreachable("unhandled s_pack_* instruction");
8605 }
8606
8607 MachineOperand &Dest = Inst.getOperand(0);
8608 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8609 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8610}
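
// Illustrative, self-contained sketch (not part of the LLVM source): the
// packed-half semantics that the s_pack_* expansions above implement, written
// as plain bit manipulation so the VALU sequences are easy to check. The
// function names are hypothetical.
static uint32_t packLL(uint32_t S0, uint32_t S1) { // low16(S0) | low16(S1)<<16
  uint32_t Tmp = 0xffffu & S0;                     // V_AND_B32
  return (S1 << 16) | Tmp;                         // V_LSHL_OR_B32
}
static uint32_t packLH(uint32_t S0, uint32_t S1) { // low16(S0) | high16(S1)
  return (S0 & 0xffffu) | (S1 & 0xffff0000u);      // V_BFI_B32 0xffff, S0, S1
}
static uint32_t packHL(uint32_t S0, uint32_t S1) { // high16(S0) | low16(S1)<<16
  return (S1 << 16) | (S0 >> 16);                  // V_LSHRREV + V_LSHL_OR
}
static uint32_t packHH(uint32_t S0, uint32_t S1) { // high16(S0) | high16(S1)
  return (S1 & 0xffff0000u) | (S0 >> 16);          // V_LSHRREV + V_AND_OR
}
// E.g. packLH(0x11112222, 0x33334444) == 0x33332222.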
8611
8612void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
8613 MachineInstr &SCCDefInst,
8614 SIInstrWorklist &Worklist,
8615 Register NewCond) const {
8616
8617 // Ensure that def inst defines SCC, which is still live.
8618 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
8619 !Op.isDead() && Op.getParent() == &SCCDefInst);
8620 SmallVector<MachineInstr *, 4> CopyToDelete;
8621 // This assumes that all the users of SCC are in the same block
8622 // as the SCC def.
8623 for (MachineInstr &MI : // Skip the def inst itself.
8624 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
8625 SCCDefInst.getParent()->end())) {
8626 // Check if SCC is used first.
8627 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
8628 if (SCCIdx != -1) {
8629 if (MI.isCopy()) {
8630 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8631 Register DestReg = MI.getOperand(0).getReg();
8632
8633 MRI.replaceRegWith(DestReg, NewCond);
8634 CopyToDelete.push_back(&MI);
8635 } else {
8636
8637 if (NewCond.isValid())
8638 MI.getOperand(SCCIdx).setReg(NewCond);
8639
8640 Worklist.insert(&MI);
8641 }
8642 }
8643 // Exit if we find another SCC def.
8644 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
8645 break;
8646 }
8647 for (auto &Copy : CopyToDelete)
8648 Copy->eraseFromParent();
8649}
8650
8651// Instructions that use SCC may be converted to VALU instructions. When that
8652// happens, the SCC register is changed to VCC_LO. The instruction that defines
8653// SCC must be changed to an instruction that defines VCC. This function makes
8654// sure that the instruction that defines SCC is added to the moveToVALU
8655// worklist.
8656void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
8657 SIInstrWorklist &Worklist) const {
8658 // Look for a preceding instruction that either defines VCC or SCC. If VCC
8659 // then there is nothing to do because the defining instruction has been
8660 // converted to a VALU already. If SCC then that instruction needs to be
8661 // converted to a VALU.
8662 for (MachineInstr &MI :
8663 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
8664 SCCUseInst->getParent()->rend())) {
8665 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
8666 break;
8667 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
8668 Worklist.insert(&MI);
8669 break;
8670 }
8671 }
8672}
8673
8674const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
8675 const MachineInstr &Inst) const {
8676 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
8677
8678 switch (Inst.getOpcode()) {
8679 // For target instructions, getOpRegClass just returns the virtual register
8680 // class associated with the operand, so we need to find an equivalent VGPR
8681 // register class in order to move the instruction to the VALU.
8682 case AMDGPU::COPY:
8683 case AMDGPU::PHI:
8684 case AMDGPU::REG_SEQUENCE:
8685 case AMDGPU::INSERT_SUBREG:
8686 case AMDGPU::WQM:
8687 case AMDGPU::SOFT_WQM:
8688 case AMDGPU::STRICT_WWM:
8689 case AMDGPU::STRICT_WQM: {
8690 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
8691 if (RI.isAGPRClass(SrcRC)) {
8692 if (RI.isAGPRClass(NewDstRC))
8693 return nullptr;
8694
8695 switch (Inst.getOpcode()) {
8696 case AMDGPU::PHI:
8697 case AMDGPU::REG_SEQUENCE:
8698 case AMDGPU::INSERT_SUBREG:
8699 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
8700 break;
8701 default:
8702 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8703 }
8704
8705 if (!NewDstRC)
8706 return nullptr;
8707 } else {
8708 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
8709 return nullptr;
8710
8711 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8712 if (!NewDstRC)
8713 return nullptr;
8714 }
8715
8716 return NewDstRC;
8717 }
8718 default:
8719 return NewDstRC;
8720 }
8721}
8722
8723// Find the one SGPR operand we are allowed to use.
8724Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
8725 int OpIndices[3]) const {
8726 const MCInstrDesc &Desc = MI.getDesc();
8727
8728 // Find the one SGPR operand we are allowed to use.
8729 //
8730 // First we need to consider the instruction's operand requirements before
8731 // legalizing. Some operands are required to be SGPRs, such as implicit uses
8732 // of VCC, but we are still bound by the constant bus requirement to only use
8733 // one.
8734 //
8735 // If the operand's class is an SGPR, we can never move it.
8736
8737 Register SGPRReg = findImplicitSGPRRead(MI);
8738 if (SGPRReg)
8739 return SGPRReg;
8740
8741 Register UsedSGPRs[3] = {Register()};
8742 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8743
8744 for (unsigned i = 0; i < 3; ++i) {
8745 int Idx = OpIndices[i];
8746 if (Idx == -1)
8747 break;
8748
8749 const MachineOperand &MO = MI.getOperand(Idx);
8750 if (!MO.isReg())
8751 continue;
8752
8753 // Is this operand statically required to be an SGPR based on the operand
8754 // constraints?
8755 const TargetRegisterClass *OpRC =
8756 RI.getRegClass(Desc.operands()[Idx].RegClass);
8757 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
8758 if (IsRequiredSGPR)
8759 return MO.getReg();
8760
8761 // If this could be a VGPR or an SGPR, check the dynamic register class.
8762 Register Reg = MO.getReg();
8763 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
8764 if (RI.isSGPRClass(RegRC))
8765 UsedSGPRs[i] = Reg;
8766 }
8767
8768 // We don't have a required SGPR operand, so we have a bit more freedom in
8769 // selecting operands to move.
8770
8771 // Try to select the most used SGPR. If an SGPR is equal to one of the
8772 // others, we choose that.
8773 //
8774 // e.g.
8775 // V_FMA_F32 v0, s0, s0, s0 -> No moves
8776 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
8777
8778 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
8779 // prefer those.
8780
8781 if (UsedSGPRs[0]) {
8782 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
8783 SGPRReg = UsedSGPRs[0];
8784 }
8785
8786 if (!SGPRReg && UsedSGPRs[1]) {
8787 if (UsedSGPRs[1] == UsedSGPRs[2])
8788 SGPRReg = UsedSGPRs[1];
8789 }
8790
8791 return SGPRReg;
8792}
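
// Illustrative, self-contained sketch (not part of the LLVM source): with no
// operand statically required to be an SGPR, findUsedSGPR above keeps an SGPR
// that appears in more than one source operand, since a repeated register
// only costs a single constant-bus slot. A plain-value model of that rule
// (hypothetical name, 0 meaning "no SGPR"):
static unsigned pickRepeatedSGPR(const unsigned Used[3]) {
  if (Used[0] && (Used[0] == Used[1] || Used[0] == Used[2]))
    return Used[0];
  if (Used[1] && Used[1] == Used[2])
    return Used[1];
  return 0;
}
// E.g. {s0, s1, s0} keeps s0, so only s1 needs to be moved to a VGPR.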
8793
8795 unsigned OperandName) const {
8796 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
8797 if (Idx == -1)
8798 return nullptr;
8799
8800 return &MI.getOperand(Idx);
8801}
8802
8808 return (Format << 44) |
8809 (1ULL << 56) | // RESOURCE_LEVEL = 1
8810 (3ULL << 60); // OOB_SELECT = 3
8811 }
8812
8813 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
8814 if (ST.isAmdHsaOS()) {
8815 // Set ATC = 1. GFX9 doesn't have this bit.
8817 RsrcDataFormat |= (1ULL << 56);
8818
8819 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
8820 // BTW, it disables TC L2 and therefore decreases performance.
8822 RsrcDataFormat |= (2ULL << 59);
8823 }
8824
8825 return RsrcDataFormat;
8826}
8827
8831 0xffffffff; // Size;
8832
8833 // GFX9 doesn't have ELEMENT_SIZE.
8835 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
8836 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
8837 }
8838
8839 // INDEX_STRIDE: 3 encodes a 64-lane stride (wave64), 2 encodes 32 (wave32).
8840 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
8841 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
8842
8843 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
8844 // Clear them unless we want a huge stride.
8847 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
8848
8849 return Rsrc23;
8850}
8851
8853 unsigned Opc = MI.getOpcode();
8854
8855 return isSMRD(Opc);
8856}
8857
8859 return get(Opc).mayLoad() &&
8860 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
8861}
8862
8864 int &FrameIndex) const {
8865 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
8866 if (!Addr || !Addr->isFI())
8867 return Register();
8868
8869 assert(!MI.memoperands_empty() &&
8870 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
8871
8872 FrameIndex = Addr->getIndex();
8873 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
8874}
8875
8877 int &FrameIndex) const {
8878 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
8879 assert(Addr && Addr->isFI());
8880 FrameIndex = Addr->getIndex();
8881 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
8882}
8883
8885 int &FrameIndex) const {
8886 if (!MI.mayLoad())
8887 return Register();
8888
8889 if (isMUBUF(MI) || isVGPRSpill(MI))
8890 return isStackAccess(MI, FrameIndex);
8891
8892 if (isSGPRSpill(MI))
8893 return isSGPRStackAccess(MI, FrameIndex);
8894
8895 return Register();
8896}
8897
8899 int &FrameIndex) const {
8900 if (!MI.mayStore())
8901 return Register();
8902
8903 if (isMUBUF(MI) || isVGPRSpill(MI))
8904 return isStackAccess(MI, FrameIndex);
8905
8906 if (isSGPRSpill(MI))
8907 return isSGPRStackAccess(MI, FrameIndex);
8908
8909 return Register();
8910}
8911
8913 unsigned Size = 0;
8915 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
8916 while (++I != E && I->isInsideBundle()) {
8917 assert(!I->isBundle() && "No nested bundle!");
8919 }
8920
8921 return Size;
8922}
8923
8925 unsigned Opc = MI.getOpcode();
8927 unsigned DescSize = Desc.getSize();
8928
8929 // If we have a definitive size, we can use it. Otherwise we need to inspect
8930 // the operands to know the size.
8931 if (isFixedSize(MI)) {
8932 unsigned Size = DescSize;
8933
8934 // If we hit the buggy offset, an extra nop will be inserted in MC so
8935 // estimate the worst case.
8936 if (MI.isBranch() && ST.hasOffset3fBug())
8937 Size += 4;
8938
8939 return Size;
8940 }
8941
8942 // Instructions may have a 32-bit literal encoded after them. Check
8943 // operands that could ever be literals.
8944 if (isVALU(MI) || isSALU(MI)) {
8945 if (isDPP(MI))
8946 return DescSize;
8947 bool HasLiteral = false;
8948 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
8949 const MachineOperand &Op = MI.getOperand(I);
8950 const MCOperandInfo &OpInfo = Desc.operands()[I];
8951 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
8952 HasLiteral = true;
8953 break;
8954 }
8955 }
8956 return HasLiteral ? DescSize + 4 : DescSize;
8957 }
8958
8959 // Check whether we have extra NSA words.
8960 if (isMIMG(MI)) {
8961 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
8962 if (VAddr0Idx < 0)
8963 return 8;
8964
8965 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
8966 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
8967 }
8968
8969 switch (Opc) {
8970 case TargetOpcode::BUNDLE:
8971 return getInstBundleSize(MI);
8972 case TargetOpcode::INLINEASM:
8973 case TargetOpcode::INLINEASM_BR: {
8974 const MachineFunction *MF = MI.getParent()->getParent();
8975 const char *AsmStr = MI.getOperand(0).getSymbolName();
8976 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
8977 }
8978 default:
8979 if (MI.isMetaInstruction())
8980 return 0;
8981 return DescSize;
8982 }
8983}
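
// Illustrative, self-contained sketch (not part of the LLVM source): the two
// size adjustments above, namely "+4 bytes when a VALU/SALU operand needs a
// 32-bit literal" and the NSA MIMG formula, which (as an assumption for this
// sketch) packs up to four extra VGPR addresses per additional encoding dword.
// Both names are hypothetical.
static unsigned valuSizeSketch(unsigned DescSize, bool HasLiteral) {
  return HasLiteral ? DescSize + 4 : DescSize;
}
static unsigned mimgNSASizeSketch(int VAddr0Idx, int RSrcIdx) {
  return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4); // e.g. 5 vaddr ops -> 12 bytes
}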
8984
8986 if (!isFLAT(MI))
8987 return false;
8988
8989 if (MI.memoperands_empty())
8990 return true;
8991
8992 for (const MachineMemOperand *MMO : MI.memoperands()) {
8993 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
8994 return true;
8995 }
8996 return false;
8997}
8998
9001 static const std::pair<int, const char *> TargetIndices[] = {
9002 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9003 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9004 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9005 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9006 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9007 return ArrayRef(TargetIndices);
9008}
9009
9010/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9011/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9014 const ScheduleDAG *DAG) const {
9015 return new GCNHazardRecognizer(DAG->MF);
9016}
9017
9018/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9019/// pass.
9022 return new GCNHazardRecognizer(MF);
9023}
9024
9025// Called during:
9026// - pre-RA scheduling and post-RA scheduling
9029 const ScheduleDAGMI *DAG) const {
9030 // Borrowed from Arm Target
9031 // We would like to restrict this hazard recognizer to only
9032 // post-RA scheduling; we can tell that we're post-RA because we don't
9033 // track VRegLiveness.
9034 if (!DAG->hasVRegLiveness())
9035 return new GCNHazardRecognizer(DAG->MF);
9037}
9038
9039std::pair<unsigned, unsigned>
9041 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9042}
9043
9046 static const std::pair<unsigned, const char *> TargetFlags[] = {
9047 { MO_GOTPCREL, "amdgpu-gotprel" },
9048 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
9049 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
9050 { MO_REL32_LO, "amdgpu-rel32-lo" },
9051 { MO_REL32_HI, "amdgpu-rel32-hi" },
9052 { MO_ABS32_LO, "amdgpu-abs32-lo" },
9053 { MO_ABS32_HI, "amdgpu-abs32-hi" },
9054 };
9055
9056 return ArrayRef(TargetFlags);
9057}
9058
9061 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9062 {
9063 {MONoClobber, "amdgpu-noclobber"},
9064 {MOLastUse, "amdgpu-last-use"},
9065 };
9066
9067 return ArrayRef(TargetFlags);
9068}
9069
9071 const MachineFunction &MF) const {
9073 assert(SrcReg.isVirtual());
9074 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9075 return AMDGPU::WWM_COPY;
9076
9077 return AMDGPU::COPY;
9078}
9079
9081 Register Reg) const {
9082 // We need to handle instructions which may be inserted during register
9083 // allocation to handle the prolog. The initial prolog instruction may have
9084 // been separated from the start of the block by spills and copies that the
9085 // prolog needs. However, the insertions for scalar registers can
9086 // always be placed at the BB top as they are independent of the exec mask
9087 // value.
9088 const MachineFunction *MF = MI.getParent()->getParent();
9089 bool IsNullOrVectorRegister = true;
9090 if (Reg) {
9091 const MachineRegisterInfo &MRI = MF->getRegInfo();
9092 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9093 }
9094
9095 uint16_t Opcode = MI.getOpcode();
9097 return IsNullOrVectorRegister &&
9098 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9099 (Opcode == AMDGPU::IMPLICIT_DEF &&
9100 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9101 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9102 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9103}
9104
9108 const DebugLoc &DL,
9109 Register DestReg) const {
9110 if (ST.hasAddNoCarry())
9111 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9112
9114 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9115 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9116
9117 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9118 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9119}
9120
9123 const DebugLoc &DL,
9124 Register DestReg,
9125 RegScavenger &RS) const {
9126 if (ST.hasAddNoCarry())
9127 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9128
9129 // If available, prefer to use vcc.
9130 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9131 ? Register(RI.getVCC())
9133 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9134 0, /* AllowSpill */ false);
9135
9136 // TODO: Users need to deal with this.
9137 if (!UnusedCarry.isValid())
9138 return MachineInstrBuilder();
9139
9140 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9141 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9142}
9143
9144bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9145 switch (Opcode) {
9146 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9147 case AMDGPU::SI_KILL_I1_TERMINATOR:
9148 return true;
9149 default:
9150 return false;
9151 }
9152}
9153
9155 switch (Opcode) {
9156 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9157 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9158 case AMDGPU::SI_KILL_I1_PSEUDO:
9159 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9160 default:
9161 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9162 }
9163}
9164
9165bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9166 return Imm <= getMaxMUBUFImmOffset(ST);
9167}
9168
9170 // The GFX12 field is a 24-bit signed byte offset; only the non-negative range is used here, hence 23 bits.
9171 const unsigned OffsetBits =
9172 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9173 return (1 << OffsetBits) - 1;
9174}
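
// Illustrative note (not part of the LLVM source): concrete values of the
// limit above, with a hypothetical helper name.
static unsigned maxMUBUFImmOffsetSketch(bool IsGFX12) {
  const unsigned OffsetBits = IsGFX12 ? 23 : 12;
  return (1u << OffsetBits) - 1;   // 8388607 on GFX12, 4095 before GFX12
}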
9175
9177 if (!ST.isWave32())
9178 return;
9179
9180 if (MI.isInlineAsm())
9181 return;
9182
9183 for (auto &Op : MI.implicit_operands()) {
9184 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9185 Op.setReg(AMDGPU::VCC_LO);
9186 }
9187}
9188
9190 if (!isSMRD(MI))
9191 return false;
9192
9193 // Check that it is using a buffer resource.
9194 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9195 if (Idx == -1) // e.g. s_memtime
9196 return false;
9197
9198 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9199 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9200}
9201
9202// Given Imm, split it into the values to put into the SOffset and ImmOffset
9203// fields in an MUBUF instruction. Return false if it is not possible (due to a
9204// hardware bug needing a workaround).
9205//
9206// The required alignment ensures that individual address components remain
9207// aligned if they are aligned to begin with. It also ensures that additional
9208// offsets within the given alignment can be added to the resulting ImmOffset.
9210 uint32_t &ImmOffset, Align Alignment) const {
9211 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9212 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9213 uint32_t Overflow = 0;
9214
9215 if (Imm > MaxImm) {
9216 if (Imm <= MaxImm + 64) {
9217 // Use an SOffset inline constant for 4..64
9218 Overflow = Imm - MaxImm;
9219 Imm = MaxImm;
9220 } else {
9221 // Try to keep the same value in SOffset for adjacent loads, so that
9222 // the corresponding register contents can be re-used.
9223 //
9224 // Load values with all low-bits (except for alignment bits) set into
9225 // SOffset, so that a larger range of values can be covered using
9226 // s_movk_i32.
9227 //
9228 // Atomic operations fail to work correctly when individual address
9229 // components are unaligned, even if their sum is aligned.
9230 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9231 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9232 Imm = Low;
9233 Overflow = High - Alignment.value();
9234 }
9235 }
9236
9237 if (Overflow > 0) {
9238 // There is a hardware bug in SI and CI which prevents address clamping in
9239 // MUBUF instructions from working correctly with SOffsets. The immediate
9240 // offset is unaffected.
9242 return false;
9243
9244 // On some targets it is not possible to use an immediate in the SOffset field.
9245 if (ST.hasRestrictedSOffset())
9246 return false;
9247 }
9248
9249 ImmOffset = Imm;
9250 SOffset = Overflow;
9251 return true;
9252}
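
// Illustrative, self-contained sketch (not part of the LLVM source): the
// SOffset/ImmOffset split above, modelled on plain integers so the invariant
// SOffset + ImmOffset == Imm is easy to check. MaxOffset is the value returned
// by getMaxMUBUFImmOffset (a power of two minus one), Alignment is a power of
// two, and the hardware-bug bail-outs are omitted. The name is hypothetical.
static void splitMUBUFOffsetSketch(uint32_t Imm, uint32_t &SOffset,
                                   uint32_t &ImmOffset, uint32_t MaxOffset,
                                   uint32_t Alignment) {
  const uint32_t MaxImm = MaxOffset & ~(Alignment - 1); // alignDown
  uint32_t Overflow = 0;
  if (Imm > MaxImm) {
    if (Imm <= MaxImm + 64) {
      Overflow = Imm - MaxImm;   // small overflow: SOffset inline constant
      Imm = MaxImm;
    } else {
      // SOffset gets all low bits (except alignment bits) set, for reuse
      // across adjacent loads.
      uint32_t High = (Imm + Alignment) & ~MaxOffset;
      uint32_t Low = (Imm + Alignment) & MaxOffset;
      Imm = Low;
      Overflow = High - Alignment;
    }
  }
  ImmOffset = Imm;
  SOffset = Overflow;            // SOffset + ImmOffset == the original Imm
}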
9253
9254// Depending on the used address space and instructions, some immediate offsets
9255// are allowed and some are not.
9256// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9257// scratch instruction offsets can also be negative. On GFX12, offsets can be
9258// negative for all variants.
9259//
9260// There are several bugs related to these offsets:
9261// On gfx10.1, flat instructions that go into the global address space cannot
9262// use an offset.
9263//
9264// For scratch instructions, the address can be either an SGPR or a VGPR.
9265// The following offsets can be used, depending on the architecture (x means
9266// cannot be used):
9267// +----------------------------+------+------+
9268// | Address-Mode | SGPR | VGPR |
9269// +----------------------------+------+------+
9270// | gfx9 | | |
9271// | negative, 4-aligned offset | x | ok |
9272// | negative, unaligned offset | x | ok |
9273// +----------------------------+------+------+
9274// | gfx10 | | |
9275// | negative, 4-aligned offset | ok | ok |
9276// | negative, unaligned offset | ok | x |
9277// +----------------------------+------+------+
9278// | gfx10.3 | | |
9279// | negative, 4-aligned offset | ok | ok |
9280// | negative, unaligned offset | ok | ok |
9281// +----------------------------+------+------+
9282//
9283// This function ignores the addressing mode, so if an offset cannot be used in
9284// one addressing mode, it is considered illegal.
9285bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9286 uint64_t FlatVariant) const {
9287 // TODO: Should 0 be special cased?
9288 if (!ST.hasFlatInstOffsets())
9289 return false;
9290
9291 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9292 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9293 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9294 return false;
9295
9297 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9298 (Offset % 4) != 0) {
9299 return false;
9300 }
9301
9302 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9303 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9304 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9305}
9306
9307// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
9308std::pair<int64_t, int64_t>
9309SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9310 uint64_t FlatVariant) const {
9311 int64_t RemainderOffset = COffsetVal;
9312 int64_t ImmField = 0;
9313
9314 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9315 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9316
9317 if (AllowNegative) {
9318 // Use signed division by a power of two to truncate towards 0.
9319 int64_t D = 1LL << NumBits;
9320 RemainderOffset = (COffsetVal / D) * D;
9321 ImmField = COffsetVal - RemainderOffset;
9322
9324 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9325 (ImmField % 4) != 0) {
9326 // Make ImmField a multiple of 4
9327 RemainderOffset += ImmField % 4;
9328 ImmField -= ImmField % 4;
9329 }
9330 } else if (COffsetVal >= 0) {
9331 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9332 RemainderOffset = COffsetVal - ImmField;
9333 }
9334
9335 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9336 assert(RemainderOffset + ImmField == COffsetVal);
9337 return {ImmField, RemainderOffset};
9338}
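
// Illustrative, self-contained sketch (not part of the LLVM source): the
// immediate/remainder split above. NumBits is the encodable offset width minus
// the sign bit; signed division by a power of two truncates toward zero, so
// ImmField keeps the sign of COffsetVal and ImmField + RemainderOffset always
// reconstructs the original value. The scratch 4-alignment fix-up is omitted
// and the name is hypothetical.
static std::pair<int64_t, int64_t>
splitFlatOffsetSketch(int64_t COffsetVal, unsigned NumBits, bool AllowNegative) {
  int64_t RemainderOffset = COffsetVal;
  int64_t ImmField = 0;
  if (AllowNegative) {
    const int64_t D = int64_t(1) << NumBits;
    RemainderOffset = (COffsetVal / D) * D;
    ImmField = COffsetVal - RemainderOffset;
  } else if (COffsetVal >= 0) {
    ImmField = COffsetVal & ((int64_t(1) << NumBits) - 1);
    RemainderOffset = COffsetVal - ImmField;
  }
  return {ImmField, RemainderOffset};  // e.g. (-5000, 12, true) -> {-904, -4096}
}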
9339
9341 if (ST.hasNegativeScratchOffsetBug() &&
9342 FlatVariant == SIInstrFlags::FlatScratch)
9343 return false;
9344
9345 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9346}
9347
9348static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9349 switch (ST.getGeneration()) {
9350 default:
9351 break;
9354 return SIEncodingFamily::SI;
9357 return SIEncodingFamily::VI;
9364 }
9365 llvm_unreachable("Unknown subtarget generation!");
9366}
9367
9368bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9369 switch(MCOp) {
9370 // These opcodes use indirect register addressing so
9371 // they need special handling by codegen (currently missing).
9372 // Therefore it is too risky to allow these opcodes
9373 // to be selected by dpp combiner or sdwa peepholer.
9374 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9375 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9376 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9377 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9378 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9379 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9380 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9381 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9382 return true;
9383 default:
9384 return false;
9385 }
9386}
9387
9388#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
9389 case OPCODE##_dpp: \
9390 case OPCODE##_e32: \
9391 case OPCODE##_e64: \
9392 case OPCODE##_e64_dpp: \
9393 case OPCODE##_sdwa:
9394
9395static bool isRenamedInGFX9(int Opcode) {
9396 switch (Opcode) {
9397 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
9398 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
9399 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
9400 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
9401 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
9402 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
9403 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
9404 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
9405 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
9406 //
9407 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
9408 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
9409 case AMDGPU::V_FMA_F16_gfx9_e64:
9410 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
9411 case AMDGPU::V_INTERP_P2_F16:
9412 case AMDGPU::V_MAD_F16_e64:
9413 case AMDGPU::V_MAD_U16_e64:
9414 case AMDGPU::V_MAD_I16_e64:
9415 return true;
9416 default:
9417 return false;
9418 }
9419}
9420
9421int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9422 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9423
9424 unsigned Gen = subtargetEncodingFamily(ST);
9425
9428
9429 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9430 // subtarget has UnpackedD16VMem feature.
9431 // TODO: remove this when we discard GFX80 encoding.
9432 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9434
9435 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9436 switch (ST.getGeneration()) {
9437 default:
9439 break;
9442 break;
9445 break;
9446 }
9447 }
9448
9449 if (isMAI(Opcode)) {
9450 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9451 if (MFMAOp != -1)
9452 Opcode = MFMAOp;
9453 }
9454
9455 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9456
9457 // -1 means that Opcode is already a native instruction.
9458 if (MCOp == -1)
9459 return Opcode;
9460
9461 if (ST.hasGFX90AInsts()) {
9462 uint16_t NMCOp = (uint16_t)-1;
9463 if (ST.hasGFX940Insts())
9465 if (NMCOp == (uint16_t)-1)
9467 if (NMCOp == (uint16_t)-1)
9469 if (NMCOp != (uint16_t)-1)
9470 MCOp = NMCOp;
9471 }
9472
9473 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9474 // no encoding in the given subtarget generation.
9475 if (MCOp == (uint16_t)-1)
9476 return -1;
9477
9478 if (isAsmOnlyOpcode(MCOp))
9479 return -1;
9480
9481 return MCOp;
9482}
9483
9484static
9486 assert(RegOpnd.isReg());
9487 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9488 getRegSubRegPair(RegOpnd);
9489}
9490
9493 assert(MI.isRegSequence());
9494 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9495 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9496 auto &RegOp = MI.getOperand(1 + 2 * I);
9497 return getRegOrUndef(RegOp);
9498 }
9500}
9501
9502// Try to find the definition of reg:subreg in subreg-manipulation pseudos
9503// Following a subreg of reg:subreg isn't supported
9506 if (!RSR.SubReg)
9507 return false;
9508 switch (MI.getOpcode()) {
9509 default: break;
9510 case AMDGPU::REG_SEQUENCE:
9511 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
9512 return true;
9513 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
9514 case AMDGPU::INSERT_SUBREG:
9515 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
9516 // inserted the subreg we're looking for
9517 RSR = getRegOrUndef(MI.getOperand(2));
9518 else { // the subreg in the rest of the reg
9519 auto R1 = getRegOrUndef(MI.getOperand(1));
9520 if (R1.SubReg) // subreg of subreg isn't supported
9521 return false;
9522 RSR.Reg = R1.Reg;
9523 }
9524 return true;
9525 }
9526 return false;
9527}
9528
9531 assert(MRI.isSSA());
9532 if (!P.Reg.isVirtual())
9533 return nullptr;
9534
9535 auto RSR = P;
9536 auto *DefInst = MRI.getVRegDef(RSR.Reg);
9537 while (auto *MI = DefInst) {
9538 DefInst = nullptr;
9539 switch (MI->getOpcode()) {
9540 case AMDGPU::COPY:
9541 case AMDGPU::V_MOV_B32_e32: {
9542 auto &Op1 = MI->getOperand(1);
9543 if (Op1.isReg() && Op1.getReg().isVirtual()) {
9544 if (Op1.isUndef())
9545 return nullptr;
9546 RSR = getRegSubRegPair(Op1);
9547 DefInst = MRI.getVRegDef(RSR.Reg);
9548 }
9549 break;
9550 }
9551 default:
9552 if (followSubRegDef(*MI, RSR)) {
9553 if (!RSR.Reg)
9554 return nullptr;
9555 DefInst = MRI.getVRegDef(RSR.Reg);
9556 }
9557 }
9558 if (!DefInst)
9559 return MI;
9560 }
9561 return nullptr;
9562}
9563
9565 Register VReg,
9566 const MachineInstr &DefMI,
9567 const MachineInstr &UseMI) {
9568 assert(MRI.isSSA() && "Must be run on SSA");
9569
9570 auto *TRI = MRI.getTargetRegisterInfo();
9571 auto *DefBB = DefMI.getParent();
9572
9573 // Don't bother searching between blocks, although it is possible this block
9574 // doesn't modify exec.
9575 if (UseMI.getParent() != DefBB)
9576 return true;
9577
9578 const int MaxInstScan = 20;
9579 int NumInst = 0;
9580
9581 // Stop scan at the use.
9582 auto E = UseMI.getIterator();
9583 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
9584 if (I->isDebugInstr())
9585 continue;
9586
9587 if (++NumInst > MaxInstScan)
9588 return true;
9589
9590 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9591 return true;
9592 }
9593
9594 return false;
9595}
9596
9598 Register VReg,
9599 const MachineInstr &DefMI) {
9600 assert(MRI.isSSA() && "Must be run on SSA");
9601
9602 auto *TRI = MRI.getTargetRegisterInfo();
9603 auto *DefBB = DefMI.getParent();
9604
9605 const int MaxUseScan = 10;
9606 int NumUse = 0;
9607
9608 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
9609 auto &UseInst = *Use.getParent();
9610 // Don't bother searching between blocks, although it is possible this block
9611 // doesn't modify exec.
9612 if (UseInst.getParent() != DefBB || UseInst.isPHI())
9613 return true;
9614
9615 if (++NumUse > MaxUseScan)
9616 return true;
9617 }
9618
9619 if (NumUse == 0)
9620 return false;
9621
9622 const int MaxInstScan = 20;
9623 int NumInst = 0;
9624
9625 // Stop scan when we have seen all the uses.
9626 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
9627 assert(I != DefBB->end());
9628
9629 if (I->isDebugInstr())
9630 continue;
9631
9632 if (++NumInst > MaxInstScan)
9633 return true;
9634
9635 for (const MachineOperand &Op : I->operands()) {
9636 // We don't check reg masks here as they're used only on calls:
9637 // 1. EXEC is only considered const within one BB
9638 // 2. Call should be a terminator instruction if present in a BB
9639
9640 if (!Op.isReg())
9641 continue;
9642
9643 Register Reg = Op.getReg();
9644 if (Op.isUse()) {
9645 if (Reg == VReg && --NumUse == 0)
9646 return false;
9647 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
9648 return true;
9649 }
9650 }
9651}
9652
9655 const DebugLoc &DL, Register Src, Register Dst) const {
9656 auto Cur = MBB.begin();
9657 if (Cur != MBB.end())
9658 do {
9659 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
9660 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
9661 ++Cur;
9662 } while (Cur != MBB.end() && Cur != LastPHIIt);
9663
9664 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
9665 Dst);
9666}
9667
9670 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
9671 if (InsPt != MBB.end() &&
9672 (InsPt->getOpcode() == AMDGPU::SI_IF ||
9673 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9674 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9675 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
9676 InsPt++;
9677 return BuildMI(MBB, InsPt, DL,
9678 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
9679 : AMDGPU::S_MOV_B64_term),
9680 Dst)
9681 .addReg(Src, 0, SrcSubReg)
9682 .addReg(AMDGPU::EXEC, RegState::Implicit);
9683 }
9684 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
9685 Dst);
9686}
9687
9688bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
9689
9692 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
9693 VirtRegMap *VRM) const {
9694 // This is a bit of a hack (copied from AArch64). Consider this instruction:
9695 //
9696 // %0:sreg_32 = COPY $m0
9697 //
9698 // We explicitly chose SReg_32 for the virtual register so such a copy might
9699 // be eliminated by RegisterCoalescer. However, that may not be possible, and
9700 // %0 may even spill. We can't spill $m0 normally (it would require copying to
9701 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
9702 // TargetInstrInfo::foldMemoryOperand() is going to try.
9703 // A similar issue also exists with spilling and reloading $exec registers.
9704 //
9705 // To prevent that, constrain the %0 register class here.
9706 if (isFullCopyInstr(MI)) {
9707 Register DstReg = MI.getOperand(0).getReg();
9708 Register SrcReg = MI.getOperand(1).getReg();
9709 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
9710 (DstReg.isVirtual() != SrcReg.isVirtual())) {
9712 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
9713 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
9714 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
9715 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
9716 return nullptr;
9717 }
9718 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
9719 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
9720 return nullptr;
9721 }
9722 }
9723 }
9724
9725 return nullptr;
9726}
9727
9729 const MachineInstr &MI,
9730 unsigned *PredCost) const {
9731 if (MI.isBundle()) {
9733 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
9734 unsigned Lat = 0, Count = 0;
9735 for (++I; I != E && I->isBundledWithPred(); ++I) {
9736 ++Count;
9737 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
9738 }
9739 return Lat + Count - 1;
9740 }
9741
9742 return SchedModel.computeInstrLatency(&MI);
9743}
9744
9747 unsigned opcode = MI.getOpcode();
9748 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
9749 auto IID = GI->getIntrinsicID();
9754
9755 switch (IID) {
9756 case Intrinsic::amdgcn_if:
9757 case Intrinsic::amdgcn_else:
9758 // FIXME: Uniform if second result
9759 break;
9760 }
9761
9763 }
9764
9765 // Loads from the private and flat address spaces are divergent, because
9766 // threads can execute the load instruction with the same inputs and get
9767 // different results.
9768 //
9769 // All other loads are not divergent, because if threads issue loads with the
9770 // same arguments, they will always get the same result.
9771 if (opcode == AMDGPU::G_LOAD) {
9772 if (MI.memoperands_empty())
9773 return InstructionUniformity::NeverUniform; // conservative assumption
9774
9775 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9776 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9777 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9778 })) {
9779 // At least one MMO in a non-global address space.
9781 }
9783 }
9784
9785 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
9786 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
9787 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
9788 AMDGPU::isGenericAtomic(opcode)) {
9790 }
9792}
9793
9796
9797 if (isNeverUniform(MI))
9799
9800 unsigned opcode = MI.getOpcode();
9801 if (opcode == AMDGPU::V_READLANE_B32 ||
9802 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
9803 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
9805
9806 if (isCopyInstr(MI)) {
9807 const MachineOperand &srcOp = MI.getOperand(1);
9808 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
9809 const TargetRegisterClass *regClass =
9810 RI.getPhysRegBaseClass(srcOp.getReg());
9813 }
9815 }
9816
9817 // GMIR handling
9818 if (MI.isPreISelOpcode())
9820
9821 // Atomics are divergent because they are executed sequentially: when an
9822 // atomic operation refers to the same address in each thread, then each
9823 // thread after the first sees the value written by the previous thread as
9824 // its original value.
9825
9826 if (isAtomic(MI))
9828
9829 // Loads from the private and flat address spaces are divergent, because
9830 // threads can execute the load instruction with the same inputs and get
9831 // different results.
9832 if (isFLAT(MI) && MI.mayLoad()) {
9833 if (MI.memoperands_empty())
9834 return InstructionUniformity::NeverUniform; // conservative assumption
9835
9836 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9837 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9838 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9839 })) {
9840 // At least one MMO in a non-global address space.
9842 }
9843
9845 }
9846
9847 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9848 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
9849
9850 // FIXME: It's conceptually broken to report this for an instruction, and not
9851 // a specific def operand. For inline asm in particular, there could be mixed
9852 // uniform and divergent results.
9853 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
9854 const MachineOperand &SrcOp = MI.getOperand(I);
9855 if (!SrcOp.isReg())
9856 continue;
9857
9858 Register Reg = SrcOp.getReg();
9859 if (!Reg || !SrcOp.readsReg())
9860 continue;
9861
9862 // If RegBank is null, this is unassigned or an unallocatable special
9863 // register, which are all scalars.
9864 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
9865 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
9867 }
9868
9869 // TODO: Uniformity check conditions above can be rearranged for more
9870 // readability
9871
9872 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
9873 // currently turned into no-op COPYs by SelectionDAG ISel and are
9874 // therefore no longer recognizable.
9875
9877}
9878
9880 switch (MF.getFunction().getCallingConv()) {
9882 return 1;
9884 return 2;
9886 return 3;
9890 report_fatal_error("ds_ordered_count unsupported for this calling conv");
9893 case CallingConv::C:
9894 case CallingConv::Fast:
9895 default:
9896 // Assume other calling conventions are various compute callable functions
9897 return 0;
9898 }
9899}
9900
9902 Register &SrcReg2, int64_t &CmpMask,
9903 int64_t &CmpValue) const {
9904 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
9905 return false;
9906
9907 switch (MI.getOpcode()) {
9908 default:
9909 break;
9910 case AMDGPU::S_CMP_EQ_U32:
9911 case AMDGPU::S_CMP_EQ_I32:
9912 case AMDGPU::S_CMP_LG_U32:
9913 case AMDGPU::S_CMP_LG_I32:
9914 case AMDGPU::S_CMP_LT_U32:
9915 case AMDGPU::S_CMP_LT_I32:
9916 case AMDGPU::S_CMP_GT_U32:
9917 case AMDGPU::S_CMP_GT_I32:
9918 case AMDGPU::S_CMP_LE_U32:
9919 case AMDGPU::S_CMP_LE_I32:
9920 case AMDGPU::S_CMP_GE_U32:
9921 case AMDGPU::S_CMP_GE_I32:
9922 case AMDGPU::S_CMP_EQ_U64:
9923 case AMDGPU::S_CMP_LG_U64:
9924 SrcReg = MI.getOperand(0).getReg();
9925 if (MI.getOperand(1).isReg()) {
9926 if (MI.getOperand(1).getSubReg())
9927 return false;
9928 SrcReg2 = MI.getOperand(1).getReg();
9929 CmpValue = 0;
9930 } else if (MI.getOperand(1).isImm()) {
9931 SrcReg2 = Register();
9932 CmpValue = MI.getOperand(1).getImm();
9933 } else {
9934 return false;
9935 }
9936 CmpMask = ~0;
9937 return true;
9938 case AMDGPU::S_CMPK_EQ_U32:
9939 case AMDGPU::S_CMPK_EQ_I32:
9940 case AMDGPU::S_CMPK_LG_U32:
9941 case AMDGPU::S_CMPK_LG_I32:
9942 case AMDGPU::S_CMPK_LT_U32:
9943 case AMDGPU::S_CMPK_LT_I32:
9944 case AMDGPU::S_CMPK_GT_U32:
9945 case AMDGPU::S_CMPK_GT_I32:
9946 case AMDGPU::S_CMPK_LE_U32:
9947 case AMDGPU::S_CMPK_LE_I32:
9948 case AMDGPU::S_CMPK_GE_U32:
9949 case AMDGPU::S_CMPK_GE_I32:
9950 SrcReg = MI.getOperand(0).getReg();
9951 SrcReg2 = Register();
9952 CmpValue = MI.getOperand(1).getImm();
9953 CmpMask = ~0;
9954 return true;
9955 }
9956
9957 return false;
9958}
9959
9961 Register SrcReg2, int64_t CmpMask,
9962 int64_t CmpValue,
9963 const MachineRegisterInfo *MRI) const {
9964 if (!SrcReg || SrcReg.isPhysical())
9965 return false;
9966
9967 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
9968 return false;
9969
9970 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
9971 this](int64_t ExpectedValue, unsigned SrcSize,
9972 bool IsReversible, bool IsSigned) -> bool {
9973 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9974 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9975 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9976 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9977 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
9978 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9979 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9980 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9981 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9982 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
9983 //
9984 // Signed ge/gt are not used for the sign bit.
9985 //
9986 // If result of the AND is unused except in the compare:
9987 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
9988 //
9989 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9990 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9991 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
9992 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9993 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9994 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
9995
9996 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
9997 if (!Def || Def->getParent() != CmpInstr.getParent())
9998 return false;
9999
10000 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10001 Def->getOpcode() != AMDGPU::S_AND_B64)
10002 return false;
10003
10004 int64_t Mask;
10005 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10006 if (MO->isImm())
10007 Mask = MO->getImm();
10008 else if (!getFoldableImm(MO, Mask))
10009 return false;
10010 Mask &= maxUIntN(SrcSize);
10011 return isPowerOf2_64(Mask);
10012 };
10013
10014 MachineOperand *SrcOp = &Def->getOperand(1);
10015 if (isMask(SrcOp))
10016 SrcOp = &Def->getOperand(2);
10017 else if (isMask(&Def->getOperand(2)))
10018 SrcOp = &Def->getOperand(1);
10019 else
10020 return false;
10021
10022 // A valid Mask is required to have a single bit set, hence a non-zero and
10023 // power-of-two value. This verifies that we will not do 64-bit shift below.
10024 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10025 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10026 if (IsSigned && BitNo == SrcSize - 1)
10027 return false;
10028
10029 ExpectedValue <<= BitNo;
10030
10031 bool IsReversedCC = false;
10032 if (CmpValue != ExpectedValue) {
10033 if (!IsReversible)
10034 return false;
10035 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10036 if (!IsReversedCC)
10037 return false;
10038 }
10039
10040 Register DefReg = Def->getOperand(0).getReg();
10041 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10042 return false;
10043
10044 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
10045 I != E; ++I) {
10046 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
10047 I->killsRegister(AMDGPU::SCC, &RI))
10048 return false;
10049 }
10050
10051 MachineOperand *SccDef =
10052 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
10053 SccDef->setIsDead(false);
10054 CmpInstr.eraseFromParent();
10055
10056 if (!MRI->use_nodbg_empty(DefReg)) {
10057 assert(!IsReversedCC);
10058 return true;
10059 }
10060
10061 // The result of the AND is unused, so replace the AND with an S_BITCMP.
10062 MachineBasicBlock *MBB = Def->getParent();
10063
10064 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10065 : AMDGPU::S_BITCMP1_B32
10066 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10067 : AMDGPU::S_BITCMP1_B64;
10068
10069 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10070 .add(*SrcOp)
10071 .addImm(BitNo);
10072 Def->eraseFromParent();
10073
10074 return true;
10075 };
10076
10077 switch (CmpInstr.getOpcode()) {
10078 default:
10079 break;
10080 case AMDGPU::S_CMP_EQ_U32:
10081 case AMDGPU::S_CMP_EQ_I32:
10082 case AMDGPU::S_CMPK_EQ_U32:
10083 case AMDGPU::S_CMPK_EQ_I32:
10084 return optimizeCmpAnd(1, 32, true, false);
10085 case AMDGPU::S_CMP_GE_U32:
10086 case AMDGPU::S_CMPK_GE_U32:
10087 return optimizeCmpAnd(1, 32, false, false);
10088 case AMDGPU::S_CMP_GE_I32:
10089 case AMDGPU::S_CMPK_GE_I32:
10090 return optimizeCmpAnd(1, 32, false, true);
10091 case AMDGPU::S_CMP_EQ_U64:
10092 return optimizeCmpAnd(1, 64, true, false);
10093 case AMDGPU::S_CMP_LG_U32:
10094 case AMDGPU::S_CMP_LG_I32:
10095 case AMDGPU::S_CMPK_LG_U32:
10096 case AMDGPU::S_CMPK_LG_I32:
10097 return optimizeCmpAnd(0, 32, true, false);
10098 case AMDGPU::S_CMP_GT_U32:
10099 case AMDGPU::S_CMPK_GT_U32:
10100 return optimizeCmpAnd(0, 32, false, false);
10101 case AMDGPU::S_CMP_GT_I32:
10102 case AMDGPU::S_CMPK_GT_I32:
10103 return optimizeCmpAnd(0, 32, false, true);
10104 case AMDGPU::S_CMP_LG_U64:
10105 return optimizeCmpAnd(0, 64, true, false);
10106 }
10107
10108 return false;
10109}
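
// Illustrative, self-contained sketch (not part of the LLVM source): why the
// optimizeCmpAnd rewrites above are sound. For a single-bit mask 1 << n, the
// SCC produced by the compare is just "bit n of Src", which is exactly what
// s_and_b32/s_and_b64 already leave in SCC and what s_bitcmp1 computes. The
// helper names are hypothetical.
static bool sccOfAnd(uint32_t Src, unsigned N) {  // s_and_b32: SCC = result != 0
  return (Src & (1u << N)) != 0;
}
static bool sccOfCmpEqMask(uint32_t Src, unsigned N) { // s_cmp_eq (and), 1 << n
  return (Src & (1u << N)) == (1u << N);
}
static bool sccOfBitcmp1(uint32_t Src, unsigned N) {   // s_bitcmp1_b32
  return (Src >> N) & 1u;
}
// All three agree for every Src and n < 32; the "reversed" cases (compare
// against 0, or s_cmp_lg against 1 << n) are the logical negation, handled by
// switching to s_bitcmp0.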
10110
10112 unsigned OpName) const {
10113 if (!ST.needsAlignedVGPRs())
10114 return;
10115
10116 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10117 if (OpNo < 0)
10118 return;
10119 MachineOperand &Op = MI.getOperand(OpNo);
10120 if (getOpSize(MI, OpNo) > 4)
10121 return;
10122
10123 // Add implicit aligned super-reg to force alignment on the data operand.
10124 const DebugLoc &DL = MI.getDebugLoc();
10125 MachineBasicBlock *BB = MI.getParent();
10127 Register DataReg = Op.getReg();
10128 bool IsAGPR = RI.isAGPR(MRI, DataReg);
10129 Register Undef = MRI.createVirtualRegister(
10130 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10131 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10132 Register NewVR =
10133 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10134 : &AMDGPU::VReg_64_Align2RegClass);
10135 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10136 .addReg(DataReg, 0, Op.getSubReg())
10137 .addImm(AMDGPU::sub0)
10138 .addReg(Undef)
10139 .addImm(AMDGPU::sub1);
10140 Op.setReg(NewVR);
10141 Op.setSubReg(AMDGPU::sub0);
10142 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
10143}
10144
10146 if (isIGLP(*MI))
10147 return false;
10148
10150}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
static bool isUndef(const MachineInstr &MI)
unsigned const TargetRegisterInfo * TRI
TargetInstrInfo::RegSubRegPair RegSubRegPair
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillRestoreOpcode(unsigned Size)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MachineRegisterInfo &MRI, const MCInstrDesc &TID, unsigned RCID, bool IsAllocatable)
static unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillSaveOpcode(unsigned Size)
static bool resultDependsOnExec(const MachineInstr &MI)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
Definition: SIInstrInfo.cpp:82
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static constexpr unsigned ModifierOpNames[]
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
Definition: SIInstrInfo.cpp:73
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef())
bool IsDead
static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool has16BitInsts() const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
bool hasInv2PiInlineImm() const
Class for arbitrary precision integers.
Definition: APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:171
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function.
Definition: Function.h:277
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
bool useVGPRIndexMode() const
bool hasSDWAOmod() const
Definition: GCNSubtarget.h:759
bool hasA16() const
bool hasSDWAScalar() const
Definition: GCNSubtarget.h:763
bool hasScalarCompareEq64() const
bool hasOnlyRevVALUShifts() const
Definition: GCNSubtarget.h:401
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:641
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:779
bool hasMAIInsts() const
Definition: GCNSubtarget.h:837
bool hasMFMAInlineLiteralBug() const
bool hasNegativeScratchOffsetBug() const
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasVALUMaskWriteHazard() const
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasR128A16() const
bool hasOffset3fBug() const
bool hasGetPCZeroExtension() const
const AMDGPURegisterBankInfo * getRegBankInfo() const override
Definition: GCNSubtarget.h:313
bool hasSDWAOutModsVOPC() const
Definition: GCNSubtarget.h:775
bool hasRestrictedSOffset() const
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:694
bool hasGFX940Insts() const
bool hasSDWASdst() const
Definition: GCNSubtarget.h:767
bool hasVALUReadSGPRHazard() const
bool hasMovB64() const
bool isWave32() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:354
bool hasNegativeUnalignedScratchOffsetBug() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasNoF16PseudoScalarTransInlineConstants() const
Generation getGeneration() const
Definition: GCNSubtarget.h:327
bool hasVOP3Literal() const
Definition: GCNSubtarget.h:946
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:746
bool hasAddr64() const
Definition: GCNSubtarget.h:391
bool isWave64() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:738
bool hasGDS() const
bool hasPartialNSAEncoding() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
A possibly irreducible generalization of a Loop.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
LiveInterval - This class represents the liveness of a register, or stack slot.
Definition: LiveInterval.h:687
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:157
iterator find(SlotIndex Pos)
find - Return an iterator pointing to the first segment that ends after Pos, or end().
void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:542
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:612
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:622
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:222
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:237
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition: MCInstrDesc.h:444
bool mayLoad() const
Return true if this instruction could possibly read memory.
Definition: MCInstrDesc.h:438
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition: MCInstrDesc.h:248
unsigned short Opcode
Definition: MCInstrDesc.h:205
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
Definition: MCInstrDesc.h:565
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
uint8_t OperandType
Information about the type of the operand.
Definition: MCInstrDesc.h:97
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:91
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:398
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
void setVariableValue(const MCExpr *Value)
Definition: MCSymbol.cpp:47
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
unsigned pred_size() const
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< iterator > terminators()
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
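The builder methods above are normally chained off BuildMI (see its entry further below). A minimal sketch, not taken from this file, assuming MBB, an insertion point I, a DebugLoc DL, and already-created registers Dst and Src:
// s_add_u32 Dst, Src, 16 -- each add* call appends one explicit operand in order.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), Dst)
    .addReg(Src, getKillRegState(true)) // last use of Src
    .addImm(16);                        // immediate source operand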
Representation of each machine instruction.
Definition: MachineInstr.h:71
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:577
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:349
void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
unsigned getNumOperands() const
Returns the total number of operands.
Definition: MachineInstr.h:580
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
iterator_range< mop_iterator > explicit_operands()
Definition: MachineInstr.h:699
unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:823
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:808
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:790
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:501
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
iterator_range< mop_iterator > implicit_operands()
Definition: MachineInstr.h:707
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:587
uint32_t getFlags() const
Return the MI flags bitvector.
Definition: MachineInstr.h:394
int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
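A short sketch of typical use of these MachineOperand accessors (not from this file; MI and Reg are assumed to already exist):
MachineOperand &Op = MI.getOperand(1);
if (Op.isReg() && Op.getReg() == Reg) {
  // Rewrite a register use whose value is known to be zero into an immediate.
  Op.ChangeToImmediate(0);
} else if (Op.isImm()) {
  // Adjust an existing immediate in place.
  Op.setImm(Op.getImm() + 4);
}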
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
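A hedged sketch of scavenging a temporary SGPR with the RegScavenger interface above, assuming RS already tracks liveness in the block and InsertPt is the position that needs the temporary:
// Find (or spill to free up) a 32-bit SGPR that is usable at InsertPt.
Register Tmp = RS.scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, InsertPt,
                                            /*RestoreAfter=*/false, /*SPAdj=*/0);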
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:115
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:801
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:563
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
static bool isVOP3(const MachineInstr &MI)
Definition: SIInstrInfo.h:513
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
Definition: SIInstrInfo.h:933
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
Definition: SIInstrInfo.h:1174
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
Definition: SIInstrInfo.h:974
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:645
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
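A usage sketch mirroring how such a partially built add is typically completed by the caller (DestReg, LHS, RHS and the insertion point are assumptions, not from this file):
// The source operands and the clamp bit are appended by the caller.
TII->getAddNoCarry(MBB, I, DL, DestReg)
    .addReg(LHS)
    .addReg(RHS)
    .addImm(0); // clamp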
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:553
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const
Get required immediate operand.
Definition: SIInstrInfo.h:1306
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:545
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:658
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:417
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:505
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:521
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:613
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:529
void removeModOperands(MachineInstr &MI) const
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
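A minimal sketch of consuming the returned pair, assuming a global FLAT access with a hypothetical byte offset COffsetVal:
std::pair<int64_t, int64_t> Split =
    TII->splitFlatOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
int64_t ImmField  = Split.first;  // part that fits in the instruction's offset field
int64_t Remainder = Split.second; // part that must be folded into the address computation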
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:597
static bool isSOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:457
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:579
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
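For example (a sketch, not from this file), looking up an operand by name instead of by fixed index:
// Returns nullptr when the opcode has no operand with this name.
if (const MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset)) {
  // Inspect SOffset->isReg() / SOffset->isImm() as needed.
}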
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:637
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:605
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
Definition: SIInstrInfo.h:433
static bool isSOPK(const MachineInstr &MI)
Definition: SIInstrInfo.h:473
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:981
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:537
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:627
static bool isF16PseudoScalarTrans(unsigned Opcode)
Definition: SIInstrInfo.h:958
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:769
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
void materializeImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, int64_t Value) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to another...
bool isAlwaysGDS(uint16_t Opcode) const
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by the assembler.
static bool isVGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:725
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
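A short sketch of the typical legality check before folding an offset (NewOffset is a hypothetical value):
if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal)) {
  // NewOffset can be encoded directly in the instruction's offset field.
}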
static bool isWWMRegSpillOpcode(uint16_t Opcode)
Definition: SIInstrInfo.h:757
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool isVGPRCopy(const MachineInstr &MI) const
Definition: SIInstrInfo.h:1027
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:589
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
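A hedged sketch pairing this hook with storeRegToStackSlot above; SrcReg, DstReg, the frame index FI, the register class RC and TRI are assumed to be provided by the caller:
// Spill SrcReg to the stack slot FI, then reload it into DstReg at the same point.
TII->storeRegToStackSlot(MBB, InsertPt, SrcReg, /*isKill=*/true, FI, RC, TRI, Register());
TII->loadRegFromStackSlot(MBB, InsertPt, DstReg, FI, RC, TRI, Register());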
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT instruction to its VADDR form if the saddr operand was moved to a VGPR.
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, const MachineOperand *fromMO, unsigned toIdx, const MachineOperand *toMO) const
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
static bool isAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:690
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
static bool sopkIsZext(unsigned Opcode)
Definition: SIInstrInfo.h:872
static bool isSGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:737
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:818
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:465
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:621
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:425
bool isBarrier(unsigned Opcode) const
Definition: SIInstrInfo.h:948
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
Definition: SIInstrInfo.h:1319
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
static bool isFixedSize(const MachineInstr &MI)
Definition: SIInstrInfo.h:889
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the operand at index OpIdx of MI.
static bool isLDSDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:571
unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isVOP1(const MachineInstr &MI)
Definition: SIInstrInfo.h:497
SIInstrInfo(const GCNSubtarget &ST)
Definition: SIInstrInfo.cpp:63
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
static bool hasVectorRegisters(const TargetRegisterClass *RC)
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool opCanUseInlineConstant(unsigned OpType) const
bool isVectorRegister(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
bool opCanUseLiteralConstant(unsigned OpType) const
static bool hasVGPRs(const TargetRegisterClass *RC)
static bool isVGPRClass(const TargetRegisterClass *RC)
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
const TargetRegisterClass * getBoolRC() const
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getChannelFromSubReg(unsigned SubReg) const
MCRegister getVCC() const
static bool hasAGPRs(const TargetRegisterClass *RC)
const TargetRegisterClass * getWaveMaskRegClass() const
bool spillSGPRToVGPR() const
const TargetRegisterClass * getVGPR64Class() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:65
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:237
SlotIndexes pass.
Definition: SlotIndexes.h:297
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:531
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:298
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
const TargetRegisterClass * getAllocatableClass(const TargetRegisterClass *RC) const
Return the maximal subclass of the given register class that is allocatable or NULL.
unsigned getSubRegIdxSize(unsigned Idx) const
Get the size of the bit range covered by a sub-register index.
unsigned getSubRegIdxOffset(unsigned Idx) const
Get the offset of the bit range covered by a sub-register index.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:95
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
Definition: SIInstrInfo.h:1594
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
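For example (a sketch): mapping an operand name to its index for a particular opcode, where -1 signals that the operand does not exist:
int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr);
if (Idx != -1) {
  const MachineOperand &VAddr = MI.getOperand(Idx); // the vaddr operand of MI
}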
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isDPALU_DPP(const MCInstrDesc &OpDesc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
Definition: SIInstrInfo.h:1595
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
bool isTrue16Inst(unsigned Opc)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo)
Is this an AMDGPU specific source operand? These include registers, inline constants,...
const uint64_t RSRC_TID_ENABLE
Definition: SIInstrInfo.h:1597
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isGenericAtomic(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the inline values intended only for floating-point operands.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition: SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition: SIDefines.h:201
@ OPERAND_REG_IMM_V2FP16
Definition: SIDefines.h:211
@ OPERAND_REG_INLINE_C_V2INT32
Definition: SIDefines.h:227
@ OPERAND_REG_INLINE_C_FP64
Definition: SIDefines.h:223
@ OPERAND_REG_INLINE_C_BF16
Definition: SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2BF16
Definition: SIDefines.h:225
@ OPERAND_REG_IMM_V2INT16
Definition: SIDefines.h:212
@ OPERAND_REG_IMM_BF16
Definition: SIDefines.h:205
@ OPERAND_REG_INLINE_AC_V2FP16
Definition: SIDefines.h:246
@ OPERAND_REG_IMM_INT32
Operands with register or 32-bit immediate.
Definition: SIDefines.h:200
@ OPERAND_REG_IMM_V2BF16
Definition: SIDefines.h:210
@ OPERAND_REG_IMM_BF16_DEFERRED
Definition: SIDefines.h:207
@ OPERAND_REG_IMM_FP16
Definition: SIDefines.h:206
@ OPERAND_REG_INLINE_C_INT64
Definition: SIDefines.h:219
@ OPERAND_REG_INLINE_AC_BF16
Definition: SIDefines.h:240
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition: SIDefines.h:217
@ OPERAND_REG_INLINE_AC_INT16
Operands with an AccVGPR register or inline constant.
Definition: SIDefines.h:238
@ OPERAND_REG_IMM_FP64
Definition: SIDefines.h:204
@ OPERAND_REG_INLINE_C_V2FP16
Definition: SIDefines.h:226
@ OPERAND_REG_INLINE_AC_V2INT16
Definition: SIDefines.h:244
@ OPERAND_REG_INLINE_AC_FP16
Definition: SIDefines.h:241
@ OPERAND_REG_INLINE_AC_INT32
Definition: SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition: SIDefines.h:242
@ OPERAND_REG_INLINE_AC_V2BF16
Definition: SIDefines.h:245
@ OPERAND_REG_IMM_V2INT32
Definition: SIDefines.h:213
@ OPERAND_REG_IMM_FP32
Definition: SIDefines.h:203
@ OPERAND_INPUT_MODS
Definition: SIDefines.h:251
@ OPERAND_REG_INLINE_C_FP32
Definition: SIDefines.h:222
@ OPERAND_REG_INLINE_C_INT32
Definition: SIDefines.h:218
@ OPERAND_REG_INLINE_C_V2INT16
Definition: SIDefines.h:224
@ OPERAND_REG_IMM_V2FP32
Definition: SIDefines.h:214
@ OPERAND_REG_INLINE_AC_FP64
Definition: SIDefines.h:243
@ OPERAND_REG_INLINE_C_FP16
Definition: SIDefines.h:221
@ OPERAND_REG_IMM_INT16
Definition: SIDefines.h:202
@ OPERAND_REG_INLINE_C_V2FP32
Definition: SIDefines.h:228
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition: SIDefines.h:231
@ OPERAND_REG_IMM_FP32_DEFERRED
Definition: SIDefines.h:209
@ OPERAND_REG_IMM_FP16_DEFERRED
Definition: SIDefines.h:208
@ TI_SCRATCH_RSRC_DWORD1
Definition: AMDGPU.h:470
@ TI_SCRATCH_RSRC_DWORD3
Definition: AMDGPU.h:472
@ TI_SCRATCH_RSRC_DWORD0
Definition: AMDGPU.h:469
@ TI_SCRATCH_RSRC_DWORD2
Definition: AMDGPU.h:471
@ TI_CONSTDATA_START
Definition: AMDGPU.h:468
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
int getMCOpcode(uint16_t Opcode, unsigned Gen)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
Definition: SIInstrInfo.h:1596
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
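A small sketch of the typical query, with Imm as a hypothetical immediate and ST the current subtarget:
int64_t Imm = 64; // any candidate immediate
bool CanInline = AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());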
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
bool isGraphics(CallingConv::ID cc)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ OPERAND_GENERIC_4
Definition: MCInstrDesc.h:70
@ OPERAND_GENERIC_2
Definition: MCInstrDesc.h:68
@ OPERAND_GENERIC_1
Definition: MCInstrDesc.h:67
@ OPERAND_REGISTER
Definition: MCInstrDesc.h:61
@ OPERAND_GENERIC_3
Definition: MCInstrDesc.h:69
@ OPERAND_IMMEDIATE
Definition: MCInstrDesc.h:60
@ OPERAND_UNKNOWN
Definition: MCInstrDesc.h:59
@ OPERAND_GENERIC_0
Definition: MCInstrDesc.h:66
@ OPERAND_GENERIC_5
Definition: MCInstrDesc.h:71
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:138
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition: SIInstrInfo.h:1480
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
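A minimal sketch of popcount; the argument must be an unsigned integer type, and the mask here is chosen purely for illustration:

#include "llvm/ADT/bit.h"
#include <cstdint>

// 0xF0F0 has eight bits set, so this returns 8.
int countMaskBits() {
  uint32_t Mask = 0xF0F0u;
  return llvm::popcount(Mask);
}
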
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
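A hedged sketch of the BuildMI builder interface; the opcode, destination register, and insertion point below are placeholders chosen for illustration, not code from this file:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"

// Emit "DstReg = <MoveOpc> 0" in front of iterator I. MoveOpc and DstReg are
// placeholders; a real caller would pass a target move-immediate opcode and a
// register of the matching class.
static void emitZero(llvm::MachineBasicBlock &MBB,
                     llvm::MachineBasicBlock::iterator I,
                     const llvm::DebugLoc &DL,
                     const llvm::TargetInstrInfo &TII,
                     unsigned MoveOpc, llvm::Register DstReg) {
  llvm::BuildMI(MBB, I, DL, TII.get(MoveOpc), DstReg).addImm(0);
}
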
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
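A minimal sketch of make_range wrapping an iterator pair so it can drive a range-based for loop; the container and bounds are illustrative:

#include "llvm/ADT/iterator_range.h"
#include <vector>

// Sum the elements strictly between the first and last entries.
int sumInterior(const std::vector<int> &Values) {
  if (Values.size() < 2)
    return 0;
  int Sum = 0;
  for (int V : llvm::make_range(Values.begin() + 1, Values.end() - 1))
    Sum += V;
  return Sum;
}
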
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address from the specified value, returning the original object being addressed.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition: STLExtras.h:657
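A small sketch of the early-increment pattern: the iterator is advanced before the loop body runs, so the current element can be erased safely. std::set here is only a stand-in for the intrusive lists this helper is typically used with:

#include "llvm/ADT/STLExtras.h"
#include <set>

// Remove every odd value while iterating; erasing the current element is safe
// because make_early_inc_range has already advanced past it.
void eraseOdd(std::set<int> &Values) {
  for (int V : llvm::make_early_inc_range(Values))
    if (V % 2 != 0)
      Values.erase(V);
}
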
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
Definition: MathExtras.h:556
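A compile-time sketch of alignDown with illustrative values:

#include "llvm/Support/MathExtras.h"

// 37 rounded down to a 16-byte boundary is 32; aligned inputs are unchanged.
static_assert(llvm::alignDown(37u, 16u) == 32u, "rounds down");
static_assert(llvm::alignDown(48u, 16u) == 48u, "already aligned");
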
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:297
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
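A short sketch covering this entry and isPowerOf2_64 above: for a power of two, countr_zero recovers the exponent (the value is illustrative):

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>

// 4096 is a power of two, and its only set bit is bit 12.
bool checkPow2() {
  uint64_t V = 4096;
  return llvm::isPowerOf2_64(V) && llvm::countr_zero(V) == 12;
}
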
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the start of the kernel to the load.
Definition: SIInstrInfo.h:43
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:341
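A minimal sketch of Log2_32 with an illustrative argument:

#include "llvm/Support/MathExtras.h"

// Floor log base 2: Log2_32(32) == 5 and Log2_32(33) == 5.
unsigned floorLog2Of33() { return llvm::Log2_32(33u); } // returns 5
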
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:155
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy-like instructions and subreg-manipulation pseudos.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:160
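A compile-time sketch pairing Lo_32 with Hi_32 (documented above) to split a 64-bit constant into two 32-bit halves; the constant is illustrative:

#include "llvm/Support/MathExtras.h"

static_assert(llvm::Hi_32(0x1122334455667788ULL) == 0x11223344u, "high half");
static_assert(llvm::Lo_32(0x1122334455667788ULL) == 0x55667788u, "low half");
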
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:404
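A compile-time sketch of divideCeil with illustrative operands:

#include "llvm/Support/MathExtras.h"

// Ten items in groups of four need three groups.
static_assert(llvm::divideCeil(10u, 4u) == 3u, "rounds up");
static_assert(llvm::divideCeil(8u, 4u) == 2u, "exact division is unchanged");
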
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
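A hedged sketch of how getKillRegState (and, analogously, getUndefRegState above) folds a boolean into the register-state flags accepted by MachineInstrBuilder::addReg; the opcode and registers are placeholders, not code from this file:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"

// Copy SrcReg into DstReg, marking the source operand as killed when the
// caller says this is its last use. CopyOpc is a placeholder opcode.
static void emitCopy(llvm::MachineBasicBlock &MBB,
                     llvm::MachineBasicBlock::iterator I,
                     const llvm::DebugLoc &DL,
                     const llvm::TargetInstrInfo &TII, unsigned CopyOpc,
                     llvm::Register DstReg, llvm::Register SrcReg,
                     bool KillSrc) {
  llvm::BuildMI(MBB, I, DL, TII.get(CopyOpc), DstReg)
      .addReg(SrcReg, llvm::getKillRegState(KillSrc));
}
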
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:261
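A small sketch of isIntN; checking whether an offset fits in a given number of signed bits is the kind of test used when deciding whether a branch displacement is in range (the width and values are illustrative):

#include "llvm/Support/MathExtras.h"
#include <cstdint>

// A 16-bit signed field holds values in [-32768, 32767].
bool fitsIn16(int64_t Offset) { return llvm::isIntN(16, Offset); }
// fitsIn16(32767) == true, fitsIn16(32768) == false.
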
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
Definition: TargetOpcodes.h:36
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition: SIInstrInfo.h:39
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
@ DS_Error
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:47
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
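A minimal sketch of is_contained over an initializer list, a common way to test a value against a small fixed set (values illustrative):

#include "llvm/ADT/STLExtras.h"

// True for 2, 3, or 5; false otherwise.
bool isSmallPrime(int V) { return llvm::is_contained({2, 3, 5}, V); }
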
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence, i.e. whether their results are always uniform, never uniform, or uniform only when all operands are uniform.
Definition: Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition: MathExtras.h:220
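A small sketch of maxUIntN with an illustrative width:

#include "llvm/Support/MathExtras.h"
#include <cstdint>

// The largest value a 16-bit unsigned field can hold is 65535.
uint64_t max16() { return llvm::maxUIntN(16); } // 0xFFFF
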
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
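A minimal sketch of swapping two BitVectors; std::swap picks up this overload, so the exchange swaps the underlying storage rather than copying bit by bit (sizes illustrative):

#include "llvm/ADT/BitVector.h"
#include <utility>

// After the swap, A holds the 4 set bits and B holds the 8 clear bits.
void swapVectors() {
  llvm::BitVector A(8);       // 8 bits, all clear
  llvm::BitVector B(4, true); // 4 bits, all set
  std::swap(A, B);
}
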
#define N
static Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition: APFloat.cpp:210
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
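A small sketch of Align and its value() accessor; the alignment must be a non-zero power of two, and 16 here is illustrative:

#include "llvm/Support/Alignment.h"
#include <cstdint>

// A 16-byte alignment reports 16 from value().
uint64_t alignmentBytes() {
  llvm::Align A(16);
  return A.value();
}
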
Description of the encoding of one expression Op.
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
Definition: LiveVariables.h:83
This class contains a discriminated union of information about pointers in memory operands, used to describe the memory location a machine memory operand refers to.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition: SIInstrInfo.h:51
MachineInstr * top() const
Definition: SIInstrInfo.h:56
bool empty() const
Definition: SIInstrInfo.h:66
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition: SIInstrInfo.h:75
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.